From c6e5ca35c4685cd920b1d5279dbc9f4483d7dfd4 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:43:55 +0200
Subject: x86/asm/tsc: Inline native_read_tsc() and remove __native_read_tsc()

In the following commit:

  cdc7957d1954 ("x86: move native_read_tsc() offline")

... native_read_tsc() was moved out of line, presumably for some
now-obsolete vDSO-related reason. Undo it.

The entire rdtsc, shl, or sequence is only 11 bytes, and calls
via rdtscl() and similar helpers were already inlined.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/d05ffe2aaf8468ca475ebc00efad7b2fa174af19.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apb_timer.c | 4 ++--
 arch/x86/kernel/tsc.c       | 6 ------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index ede92c3364d3..9fe111cc50f8 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
 	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;
 
-	t1 = __native_read_tsc();
+	t1 = native_read_tsc();
 
 	do {
 		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);
 
-	t2 = __native_read_tsc();
+	t2 = native_read_tsc();
 
 	shift = 5;
 	if (unlikely(loop >> shift == 0)) {
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 505449700e0c..e7710cd7ba00 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -308,12 +308,6 @@ unsigned long long
 sched_clock(void) __attribute__((alias("native_sched_clock")));
 #endif
 
-unsigned long long native_read_tsc(void)
-{
-	return __native_read_tsc();
-}
-EXPORT_SYMBOL(native_read_tsc);
-
 int check_tsc_unstable(void)
 {
 	return tsc_unstable;
-- 
cgit v1.2.3-70-g09d2


From 9261e050b686c9fe229cd9918d997b3caaf20e34 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:43:57 +0200
Subject: x86/asm/tsc, x86/paravirt: Remove read_tsc() and read_tscp() paravirt
 hooks

We've had ->read_tsc() and ->read_tscp() paravirt hooks since
the very beginning of paravirt, i.e.,

  d3561b7fa0fb ("[PATCH] paravirt: header and stubs for paravirtualisation").

AFAICT, the only paravirt guest implementation that ever
replaced these calls was vmware, and it's gone. Arguably even
vmware shouldn't have hooked RDTSC -- we fully support systems
that don't have a TSC at all, so there's no point for a paravirt
implementation to pretend that we have a TSC but to replace it.

I also doubt that these hooks actually worked. Calls to rdtscl()
and rdtscll(), which respected the hooks, were used seemingly
interchangeably with native_read_tsc(), which did not.

Just remove them. If anyone ever needs them again, they can try
to make a case for why they need them.

Before, on a paravirt config:
  text    	data     bss     dec     hex filename
  12618257      1816384 1093632 15528273 ecf151 vmlinux

After:
  text		data     bss     dec     hex filename
  12617207      1816384 1093632 15527223 eced37 vmlinux

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: virtualization@lists.linux-foundation.org
Link: http://lkml.kernel.org/r/d08a2600fb298af163681e5efd8e599d889a5b97.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/msr.h            | 16 ++++++++--------
 arch/x86/include/asm/paravirt.h       | 34 ----------------------------------
 arch/x86/include/asm/paravirt_types.h |  2 --
 arch/x86/kernel/paravirt.c            |  2 --
 arch/x86/kernel/paravirt_patch_32.c   |  2 --
 arch/x86/xen/enlighten.c              |  3 ---
 6 files changed, 8 insertions(+), 51 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 88711470af7f..d1afac7df484 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -178,12 +178,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	return err;
 }
 
-#define rdtscl(low)						\
-	((low) = (u32)native_read_tsc())
-
-#define rdtscll(val)						\
-	((val) = native_read_tsc())
-
 #define rdpmc(counter, low, high)			\
 do {							\
 	u64 _l = native_read_pmc((counter));		\
@@ -193,6 +187,14 @@ do {							\
 
 #define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
 
+#endif	/* !CONFIG_PARAVIRT */
+
+#define rdtscl(low)						\
+	((low) = (u32)native_read_tsc())
+
+#define rdtscll(val)						\
+	((val) = native_read_tsc())
+
 #define rdtscp(low, high, aux)					\
 do {                                                            \
 	unsigned long long _val = native_read_tscp(&(aux));     \
@@ -202,8 +204,6 @@ do {                                                            \
 
 #define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
 
-#endif	/* !CONFIG_PARAVIRT */
-
 /*
  * 64-bit version of wrmsr_safe():
  */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index d143bfad45d7..c2be0375bcad 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -174,19 +174,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	return err;
 }
 
-static inline u64 paravirt_read_tsc(void)
-{
-	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
-}
-
-#define rdtscl(low)				\
-do {						\
-	u64 _l = paravirt_read_tsc();		\
-	low = (int)_l;				\
-} while (0)
-
-#define rdtscll(val) (val = paravirt_read_tsc())
-
 static inline unsigned long long paravirt_sched_clock(void)
 {
 	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
@@ -215,27 +202,6 @@ do {						\
 
 #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
 
-static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
-{
-	return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
-}
-
-#define rdtscp(low, high, aux)				\
-do {							\
-	int __aux;					\
-	unsigned long __val = paravirt_rdtscp(&__aux);	\
-	(low) = (u32)__val;				\
-	(high) = (u32)(__val >> 32);			\
-	(aux) = __aux;					\
-} while (0)
-
-#define rdtscpll(val, aux)				\
-do {							\
-	unsigned long __aux; 				\
-	val = paravirt_rdtscp(&__aux);			\
-	(aux) = __aux;					\
-} while (0)
-
 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
 	PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index a6b8f9fadb06..ce029e4fa7c6 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -156,9 +156,7 @@ struct pv_cpu_ops {
 	u64 (*read_msr)(unsigned int msr, int *err);
 	int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
 
-	u64 (*read_tsc)(void);
 	u64 (*read_pmc)(int counter);
-	unsigned long long (*read_tscp)(unsigned int *aux);
 
 #ifdef CONFIG_X86_32
 	/*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 58bcfb67c01f..f68e48f5f6c2 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -351,9 +351,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
 	.wbinvd = native_wbinvd,
 	.read_msr = native_read_msr_safe,
 	.write_msr = native_write_msr_safe,
-	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.read_tscp = native_read_tscp,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
 	.load_gdt = native_load_gdt,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index e1b013696dde..c89f50a76e97 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
@@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
 		PATCH_SITE(pv_cpu_ops, clts);
-		PATCH_SITE(pv_cpu_ops, read_tsc);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
 			if (pv_is_native_spin_unlock()) {
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0b95c9b8283f..32136bfca43f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1175,11 +1175,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.read_msr = xen_read_msr_safe,
 	.write_msr = xen_write_msr_safe,
 
-	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
 
-	.read_tscp = native_read_tscp,
-
 	.iret = xen_iret,
 #ifdef CONFIG_X86_64
 	.usergs_sysret32 = xen_sysret32,
-- 
cgit v1.2.3-70-g09d2


From 87be28aaf1458445d5f648688c2eec0f13b8f3b9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:43:58 +0200
Subject: x86/asm/tsc: Replace rdtscll() with native_read_tsc()

Now that the ->read_tsc() paravirt hook is gone, rdtscll() is
just a wrapper around native_read_tsc(). Unwrap it.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/d2449ae62c1b1fb90195bcfb19ef4a35883a04dc.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/aslr.c                      | 2 +-
 arch/x86/include/asm/msr.h                           | 3 ---
 arch/x86/include/asm/tsc.h                           | 5 +----
 arch/x86/kernel/apb_timer.c                          | 4 ++--
 arch/x86/kernel/apic/apic.c                          | 8 ++++----
 arch/x86/kernel/cpu/mcheck/mce.c                     | 4 ++--
 arch/x86/kernel/espfix_64.c                          | 2 +-
 arch/x86/kernel/hpet.c                               | 4 ++--
 arch/x86/kernel/trace_clock.c                        | 2 +-
 arch/x86/kernel/tsc.c                                | 4 ++--
 arch/x86/kvm/vmx.c                                   | 2 +-
 arch/x86/lib/delay.c                                 | 2 +-
 drivers/thermal/intel_powerclamp.c                   | 4 ++--
 tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c | 4 ++--
 14 files changed, 22 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index d7b1f655b3ef..ea33236190b1 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -82,7 +82,7 @@ static unsigned long get_random_long(void)
 
 	if (has_cpuflag(X86_FEATURE_TSC)) {
 		debug_putstr(" RDTSC");
-		rdtscll(raw);
+		raw = native_read_tsc();
 
 		random ^= raw;
 		use_i8254 = false;
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index d1afac7df484..7273b74e0f99 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -192,9 +192,6 @@ do {							\
 #define rdtscl(low)						\
 	((low) = (u32)native_read_tsc())
 
-#define rdtscll(val)						\
-	((val) = native_read_tsc())
-
 #define rdtscp(low, high, aux)					\
 do {                                                            \
 	unsigned long long _val = native_read_tscp(&(aux));     \
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 3da1cc1218ac..b4883902948b 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -21,15 +21,12 @@ extern void disable_TSC(void);
 
 static inline cycles_t get_cycles(void)
 {
-	unsigned long long ret = 0;
-
 #ifndef CONFIG_X86_TSC
 	if (!cpu_has_tsc)
 		return 0;
 #endif
-	rdtscll(ret);
 
-	return ret;
+	return native_read_tsc();
 }
 
 extern void tsc_init(void);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 9fe111cc50f8..25efa534c4e4 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)
 
 	/* Verify whether apbt counter works */
 	t1 = dw_apb_clocksource_read(clocksource_apbt);
-	rdtscll(start);
+	start = native_read_tsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = native_read_tsc();
 	} while ((now - start) < 200000UL);
 
 	/* APBT is the only always on clocksource, it has to work! */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dcb52850a28f..51af1ed1ae2e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -457,7 +457,7 @@ static int lapic_next_deadline(unsigned long delta,
 {
 	u64 tsc;
 
-	rdtscll(tsc);
+	tsc = native_read_tsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
 }
@@ -592,7 +592,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	unsigned long pm = acpi_pm_read_early();
 
 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = native_read_tsc();
 
 	switch (lapic_cal_loops++) {
 	case 0:
@@ -1209,7 +1209,7 @@ void setup_local_APIC(void)
 	long long max_loops = cpu_khz ? cpu_khz : 1000000;
 
 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = native_read_tsc();
 
 	if (disable_apic) {
 		disable_ioapic_support();
@@ -1293,7 +1293,7 @@ void setup_local_APIC(void)
 		}
 		if (queued) {
 			if (cpu_has_tsc && cpu_khz) {
-				rdtscll(ntsc);
+				ntsc = native_read_tsc();
 				max_loops = (cpu_khz << 10) - (ntsc - tsc);
 			} else
 				max_loops--;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index df919ff103c3..a5283d2d0094 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -125,7 +125,7 @@ void mce_setup(struct mce *m)
 {
 	memset(m, 0, sizeof(struct mce));
 	m->cpu = m->extcpu = smp_processor_id();
-	rdtscll(m->tsc);
+	m->tsc = native_read_tsc();
 	/* We hope get_seconds stays lockless */
 	m->time = get_seconds();
 	m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -1784,7 +1784,7 @@ static void collect_tscs(void *data)
 {
 	unsigned long *cpu_tsc = (unsigned long *)data;
 
-	rdtscll(cpu_tsc[smp_processor_id()]);
+	cpu_tsc[smp_processor_id()] = native_read_tsc();
 }
 
 static int mce_apei_read_done;
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730e7b08..334a2a9c034d 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -110,7 +110,7 @@ static void init_espfix_random(void)
 	 */
 	if (!arch_get_random_long(&rand)) {
 		/* The constant is an arbitrary large prime */
-		rdtscll(rand);
+		rand = native_read_tsc();
 		rand *= 0xc345c6b72fd16123UL;
 	}
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 10757d0a3fcf..cc390fe69b71 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -735,7 +735,7 @@ static int hpet_clocksource_register(void)
 
 	/* Verify whether hpet counter works */
 	t1 = hpet_readl(HPET_COUNTER);
-	rdtscll(start);
+	start = native_read_tsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -745,7 +745,7 @@ static int hpet_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = native_read_tsc();
 	} while ((now - start) < 200000UL);
 
 	if (t1 == hpet_readl(HPET_COUNTER)) {
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index 25b993729f9b..bd8f4d41bd56 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -15,7 +15,7 @@ u64 notrace trace_clock_x86_tsc(void)
 	u64 ret;
 
 	rdtsc_barrier();
-	rdtscll(ret);
+	ret = native_read_tsc();
 
 	return ret;
 }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index e7710cd7ba00..e66f5dcaeb63 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 	data = cyc2ns_write_begin(cpu);
 
-	rdtscll(tsc_now);
+	tsc_now = native_read_tsc();
 	ns_now = cycles_2_ns(tsc_now);
 
 	/*
@@ -290,7 +290,7 @@ u64 native_sched_clock(void)
 	}
 
 	/* read the Time Stamp Counter: */
-	rdtscll(tsc_now);
+	tsc_now = native_read_tsc();
 
 	/* return the value in ns */
 	return cycles_2_ns(tsc_now);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e856dd566f4c..4fa1ccad7beb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void)
 {
 	u64 host_tsc, tsc_offset;
 
-	rdtscll(host_tsc);
+	host_tsc = native_read_tsc();
 	tsc_offset = vmcs_read64(TSC_OFFSET);
 	return host_tsc + tsc_offset;
 }
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 39d6a3db0b96..9a52ad0c0758 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -100,7 +100,7 @@ void use_tsc_delay(void)
 int read_current_timer(unsigned long *timer_val)
 {
 	if (delay_fn == delay_tsc) {
-		rdtscll(*timer_val);
+		*timer_val = native_read_tsc();
 		return 0;
 	}
 	return -1;
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 5820e8513927..ab13448defcf 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -340,7 +340,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
 
 	/* check result for the last window */
 	msr_now = pkg_state_counter();
-	rdtscll(tsc_now);
+	tsc_now = native_read_tsc();
 
 	/* calculate pkg cstate vs tsc ratio */
 	if (!msr_last || !tsc_last)
@@ -482,7 +482,7 @@ static void poll_pkg_cstate(struct work_struct *dummy)
 	u64 val64;
 
 	msr_now = pkg_state_counter();
-	rdtscll(tsc_now);
+	tsc_now = native_read_tsc();
 	jiffies_now = jiffies;
 
 	/* calculate pkg cstate vs tsc ratio */
diff --git a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
index 5224ee5b392d..f02b0c0bff9b 100644
--- a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
+++ b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
@@ -81,11 +81,11 @@ static int __init cpufreq_test_tsc(void)
 
 	printk(KERN_DEBUG "start--> \n");
 	then = read_pmtmr();
-        rdtscll(then_tsc);
+	then_tsc = native_read_tsc();
 	for (i=0;i<20;i++) {
 		mdelay(100);
 		now = read_pmtmr();
-		rdtscll(now_tsc);
+		now_tsc = native_read_tsc();
 		diff = (now - then) & 0xFFFFFF;
 		diff_tsc = now_tsc - then_tsc;
 		printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc);
-- 
cgit v1.2.3-70-g09d2


From 3796366614598e48edf0561b86f18c230a7debc8 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:01 +0200
Subject: x86/asm/tsc, x86/cpu/amd: Use the full 64-bit TSC to detect the 2.6.2
 bug

This code is timing 100k indirect calls, so the added overhead
of counting the number of cycles elapsed as a 64-bit number
should be insignificant.  Drop the optimization of using a
32-bit count.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/d58f339a9c0dd8352b50d2f7a216f67ec2844f20.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/amd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd3a4baffe50..a69710db6112 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -114,7 +114,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 		const int K6_BUG_LOOP = 1000000;
 		int n;
 		void (*f_vide)(void);
-		unsigned long d, d2;
+		u64 d, d2;
 
 		printk(KERN_INFO "AMD K6 stepping B detected - ");
 
@@ -125,10 +125,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 
 		n = K6_BUG_LOOP;
 		f_vide = vide;
-		rdtscl(d);
+		d = native_read_tsc();
 		while (n--)
 			f_vide();
-		rdtscl(d2);
+		d2 = native_read_tsc();
 		d = d2-d;
 
 		if (d > 20*K6_BUG_LOOP)
-- 
cgit v1.2.3-70-g09d2


From 4ea1636b04dbd66536fa387bae2eea463efc705b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:07 +0200
Subject: x86/asm/tsc: Rename native_read_tsc() to rdtsc()

Now that there is no paravirt TSC, the "native" is
inappropriate. The function does RDTSC, so give it the obvious
name: rdtsc().

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/fd43e16281991f096c1e4d21574d9e1402c62d39.1434501121.git.luto@kernel.org
[ Ported it to v4.2-rc1. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/aslr.c                      |  2 +-
 arch/x86/entry/vdso/vclock_gettime.c                 |  2 +-
 arch/x86/include/asm/msr.h                           | 11 ++++++++++-
 arch/x86/include/asm/pvclock.h                       |  2 +-
 arch/x86/include/asm/stackprotector.h                |  2 +-
 arch/x86/include/asm/tsc.h                           |  2 +-
 arch/x86/kernel/apb_timer.c                          |  8 ++++----
 arch/x86/kernel/apic/apic.c                          |  8 ++++----
 arch/x86/kernel/cpu/amd.c                            |  4 ++--
 arch/x86/kernel/cpu/mcheck/mce.c                     |  4 ++--
 arch/x86/kernel/espfix_64.c                          |  2 +-
 arch/x86/kernel/hpet.c                               |  4 ++--
 arch/x86/kernel/trace_clock.c                        |  2 +-
 arch/x86/kernel/tsc.c                                |  4 ++--
 arch/x86/kvm/lapic.c                                 |  4 ++--
 arch/x86/kvm/svm.c                                   |  4 ++--
 arch/x86/kvm/vmx.c                                   |  4 ++--
 arch/x86/kvm/x86.c                                   | 12 ++++++------
 arch/x86/lib/delay.c                                 |  8 ++++----
 drivers/cpufreq/intel_pstate.c                       |  2 +-
 drivers/input/gameport/gameport.c                    |  4 ++--
 drivers/input/joystick/analog.c                      |  4 ++--
 drivers/net/hamradio/baycom_epp.c                    |  2 +-
 drivers/thermal/intel_powerclamp.c                   |  4 ++--
 tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c |  4 ++--
 25 files changed, 59 insertions(+), 50 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index ea33236190b1..6a9b96b4624d 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -82,7 +82,7 @@ static unsigned long get_random_long(void)
 
 	if (has_cpuflag(X86_FEATURE_TSC)) {
 		debug_putstr(" RDTSC");
-		raw = native_read_tsc();
+		raw = rdtsc();
 
 		random ^= raw;
 		use_i8254 = false;
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 972b488ac16a..0340d93c18ca 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -186,7 +186,7 @@ notrace static cycle_t vread_tsc(void)
 	 * but no one has ever seen it happen.
 	 */
 	rdtsc_barrier();
-	ret = (cycle_t)native_read_tsc();
+	ret = (cycle_t)rdtsc();
 
 	last = gtod->cycle_last;
 
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c89ed6ceed02..ff0c120dafe5 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -109,7 +109,16 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
 extern int rdmsr_safe_regs(u32 regs[8]);
 extern int wrmsr_safe_regs(u32 regs[8]);
 
-static __always_inline unsigned long long native_read_tsc(void)
+/**
+ * rdtsc() - returns the current TSC without ordering constraints
+ *
+ * rdtsc() returns the result of RDTSC as a 64-bit integer.  The
+ * only ordering constraint it supplies is the ordering implied by
+ * "asm volatile": it will put the RDTSC in the place you expect.  The
+ * CPU can and will speculatively execute that RDTSC, though, so the
+ * results can be non-monotonic if compared on different CPUs.
+ */
+static __always_inline unsigned long long rdtsc(void)
 {
 	DECLARE_ARGS(val, low, high);
 
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 2bd69d62c623..5c490db62e32 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 static __always_inline
 u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
 {
-	u64 delta = native_read_tsc() - src->tsc_timestamp;
+	u64 delta = rdtsc() - src->tsc_timestamp;
 	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
 				   src->tsc_shift);
 }
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index bc5fa2af112e..58505f01962f 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void)
 	 * on during the bootup the random pool has true entropy too.
 	 */
 	get_random_bytes(&canary, sizeof(canary));
-	tsc = native_read_tsc();
+	tsc = rdtsc();
 	canary += tsc + (tsc << 32UL);
 
 	current->stack_canary = canary;
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index b4883902948b..3df7675debcf 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -26,7 +26,7 @@ static inline cycles_t get_cycles(void)
 		return 0;
 #endif
 
-	return native_read_tsc();
+	return rdtsc();
 }
 
 extern void tsc_init(void);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 25efa534c4e4..222a57076039 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)
 
 	/* Verify whether apbt counter works */
 	t1 = dw_apb_clocksource_read(clocksource_apbt);
-	start = native_read_tsc();
+	start = rdtsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		now = native_read_tsc();
+		now = rdtsc();
 	} while ((now - start) < 200000UL);
 
 	/* APBT is the only always on clocksource, it has to work! */
@@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
 	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;
 
-	t1 = native_read_tsc();
+	t1 = rdtsc();
 
 	do {
 		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);
 
-	t2 = native_read_tsc();
+	t2 = rdtsc();
 
 	shift = 5;
 	if (unlikely(loop >> shift == 0)) {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 51af1ed1ae2e..0d71cd9b4a50 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -457,7 +457,7 @@ static int lapic_next_deadline(unsigned long delta,
 {
 	u64 tsc;
 
-	tsc = native_read_tsc();
+	tsc = rdtsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
 }
@@ -592,7 +592,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	unsigned long pm = acpi_pm_read_early();
 
 	if (cpu_has_tsc)
-		tsc = native_read_tsc();
+		tsc = rdtsc();
 
 	switch (lapic_cal_loops++) {
 	case 0:
@@ -1209,7 +1209,7 @@ void setup_local_APIC(void)
 	long long max_loops = cpu_khz ? cpu_khz : 1000000;
 
 	if (cpu_has_tsc)
-		tsc = native_read_tsc();
+		tsc = rdtsc();
 
 	if (disable_apic) {
 		disable_ioapic_support();
@@ -1293,7 +1293,7 @@ void setup_local_APIC(void)
 		}
 		if (queued) {
 			if (cpu_has_tsc && cpu_khz) {
-				ntsc = native_read_tsc();
+				ntsc = rdtsc();
 				max_loops = (cpu_khz << 10) - (ntsc - tsc);
 			} else
 				max_loops--;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a69710db6112..51ad2af84a72 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -125,10 +125,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 
 		n = K6_BUG_LOOP;
 		f_vide = vide;
-		d = native_read_tsc();
+		d = rdtsc();
 		while (n--)
 			f_vide();
-		d2 = native_read_tsc();
+		d2 = rdtsc();
 		d = d2-d;
 
 		if (d > 20*K6_BUG_LOOP)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a5283d2d0094..96cceccd11b4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -125,7 +125,7 @@ void mce_setup(struct mce *m)
 {
 	memset(m, 0, sizeof(struct mce));
 	m->cpu = m->extcpu = smp_processor_id();
-	m->tsc = native_read_tsc();
+	m->tsc = rdtsc();
 	/* We hope get_seconds stays lockless */
 	m->time = get_seconds();
 	m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -1784,7 +1784,7 @@ static void collect_tscs(void *data)
 {
 	unsigned long *cpu_tsc = (unsigned long *)data;
 
-	cpu_tsc[smp_processor_id()] = native_read_tsc();
+	cpu_tsc[smp_processor_id()] = rdtsc();
 }
 
 static int mce_apei_read_done;
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 334a2a9c034d..67315cd0132c 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -110,7 +110,7 @@ static void init_espfix_random(void)
 	 */
 	if (!arch_get_random_long(&rand)) {
 		/* The constant is an arbitrary large prime */
-		rand = native_read_tsc();
+		rand = rdtsc();
 		rand *= 0xc345c6b72fd16123UL;
 	}
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index cc390fe69b71..f75c5908c7a6 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -735,7 +735,7 @@ static int hpet_clocksource_register(void)
 
 	/* Verify whether hpet counter works */
 	t1 = hpet_readl(HPET_COUNTER);
-	start = native_read_tsc();
+	start = rdtsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -745,7 +745,7 @@ static int hpet_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		now = native_read_tsc();
+		now = rdtsc();
 	} while ((now - start) < 200000UL);
 
 	if (t1 == hpet_readl(HPET_COUNTER)) {
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index bd8f4d41bd56..67efb8c96fc4 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -15,7 +15,7 @@ u64 notrace trace_clock_x86_tsc(void)
 	u64 ret;
 
 	rdtsc_barrier();
-	ret = native_read_tsc();
+	ret = rdtsc();
 
 	return ret;
 }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index e66f5dcaeb63..21d6e04e3e82 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 	data = cyc2ns_write_begin(cpu);
 
-	tsc_now = native_read_tsc();
+	tsc_now = rdtsc();
 	ns_now = cycles_2_ns(tsc_now);
 
 	/*
@@ -290,7 +290,7 @@ u64 native_sched_clock(void)
 	}
 
 	/* read the Time Stamp Counter: */
-	tsc_now = native_read_tsc();
+	tsc_now = rdtsc();
 
 	/* return the value in ns */
 	return cycles_2_ns(tsc_now);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 954e98a8c2e3..2f0ade48614f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1172,7 +1172,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
-	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
 	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
 
 	/* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
@@ -1240,7 +1240,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		local_irq_save(flags);
 
 		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 602b974a60a6..8dfbad7a2c44 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1080,7 +1080,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
 	u64 tsc;
 
-	tsc = svm_scale_tsc(vcpu, native_read_tsc());
+	tsc = svm_scale_tsc(vcpu, rdtsc());
 
 	return target_tsc - tsc;
 }
@@ -3079,7 +3079,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	switch (msr_info->index) {
 	case MSR_IA32_TSC: {
 		msr_info->data = svm->vmcb->control.tsc_offset +
-			svm_scale_tsc(vcpu, native_read_tsc());
+			svm_scale_tsc(vcpu, rdtsc());
 
 		break;
 	}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4fa1ccad7beb..10d69a6df14f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void)
 {
 	u64 host_tsc, tsc_offset;
 
-	host_tsc = native_read_tsc();
+	host_tsc = rdtsc();
 	tsc_offset = vmcs_read64(TSC_OFFSET);
 	return host_tsc + tsc_offset;
 }
@@ -2317,7 +2317,7 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
 
 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
-	return target_tsc - native_read_tsc();
+	return target_tsc - rdtsc();
 }
 
 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f771058cfb5c..dfa97139282d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1455,7 +1455,7 @@ static cycle_t read_tsc(void)
 	 * but no one has ever seen it happen.
 	 */
 	rdtsc_barrier();
-	ret = (cycle_t)native_read_tsc();
+	ret = (cycle_t)rdtsc();
 
 	last = pvclock_gtod_data.clock.cycle_last;
 
@@ -1646,7 +1646,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		return 1;
 	}
 	if (!use_master_clock) {
-		host_tsc = native_read_tsc();
+		host_tsc = rdtsc();
 		kernel_ns = get_kernel_ns();
 	}
 
@@ -2810,7 +2810,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
-				native_read_tsc() - vcpu->arch.last_host_tsc;
+				rdtsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 		if (check_tsc_unstable()) {
@@ -2838,7 +2838,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	vcpu->arch.last_host_tsc = native_read_tsc();
+	vcpu->arch.last_host_tsc = rdtsc();
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -6623,7 +6623,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		hw_breakpoint_restore();
 
 	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
-							   native_read_tsc());
+							   rdtsc());
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -7437,7 +7437,7 @@ int kvm_arch_hardware_enable(void)
 	if (ret != 0)
 		return ret;
 
-	local_tsc = native_read_tsc();
+	local_tsc = rdtsc();
 	stable = !check_tsc_unstable();
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 35115f3786a9..f24bc59ab0a0 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -55,10 +55,10 @@ static void delay_tsc(unsigned long __loops)
 	preempt_disable();
 	cpu = smp_processor_id();
 	rdtsc_barrier();
-	bclock = native_read_tsc();
+	bclock = rdtsc();
 	for (;;) {
 		rdtsc_barrier();
-		now = native_read_tsc();
+		now = rdtsc();
 		if ((now - bclock) >= loops)
 			break;
 
@@ -80,7 +80,7 @@ static void delay_tsc(unsigned long __loops)
 			loops -= (now - bclock);
 			cpu = smp_processor_id();
 			rdtsc_barrier();
-			bclock = native_read_tsc();
+			bclock = rdtsc();
 		}
 	}
 	preempt_enable();
@@ -100,7 +100,7 @@ void use_tsc_delay(void)
 int read_current_timer(unsigned long *timer_val)
 {
 	if (delay_fn == delay_tsc) {
-		*timer_val = native_read_tsc();
+		*timer_val = rdtsc();
 		return 0;
 	}
 	return -1;
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 15ada47bb720..7c56d7eaa671 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -765,7 +765,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
 	local_irq_save(flags);
 	rdmsrl(MSR_IA32_APERF, aperf);
 	rdmsrl(MSR_IA32_MPERF, mperf);
-	tsc = native_read_tsc();
+	tsc = rdtsc();
 	local_irq_restore(flags);
 
 	cpu->last_sample_time = cpu->sample.time;
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index abc0cb22e750..4a2a9e370be7 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -149,9 +149,9 @@ static int old_gameport_measure_speed(struct gameport *gameport)
 
 	for(i = 0; i < 50; i++) {
 		local_irq_save(flags);
-		t1 = native_read_tsc();
+		t1 = rdtsc();
 		for (t = 0; t < 50; t++) gameport_read(gameport);
-		t2 = native_read_tsc();
+		t2 = rdtsc();
 		local_irq_restore(flags);
 		udelay(i * 10);
 		if (t2 - t1 < tx) tx = t2 - t1;
diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index f871b4f00056..6f8b084e13d0 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -143,7 +143,7 @@ struct analog_port {
 
 #include <linux/i8253.h>
 
-#define GET_TIME(x)	do { if (cpu_has_tsc) x = (unsigned int)native_read_tsc(); else x = get_time_pit(); } while (0)
+#define GET_TIME(x)	do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0)
 #define DELTA(x,y)	(cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0)))
 #define TIME_NAME	(cpu_has_tsc?"TSC":"PIT")
 static unsigned int get_time_pit(void)
@@ -160,7 +160,7 @@ static unsigned int get_time_pit(void)
         return count;
 }
 #elif defined(__x86_64__)
-#define GET_TIME(x)	do { x = (unsigned int)native_read_tsc(); } while (0)
+#define GET_TIME(x)	do { x = (unsigned int)rdtsc(); } while (0)
 #define DELTA(x,y)	((y)-(x))
 #define TIME_NAME	"TSC"
 #elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 44e5c3b5e0af..72c9f1f352b4 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -638,7 +638,7 @@ static int receive(struct net_device *dev, int cnt)
 #define GETTICK(x)                                                \
 ({                                                                \
 	if (cpu_has_tsc)                                          \
-		x = (unsigned int)native_read_tsc();		  \
+		x = (unsigned int)rdtsc();		  \
 })
 #else /* __i386__ */
 #define GETTICK(x)
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index ab13448defcf..2ac0c704bcb8 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -340,7 +340,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
 
 	/* check result for the last window */
 	msr_now = pkg_state_counter();
-	tsc_now = native_read_tsc();
+	tsc_now = rdtsc();
 
 	/* calculate pkg cstate vs tsc ratio */
 	if (!msr_last || !tsc_last)
@@ -482,7 +482,7 @@ static void poll_pkg_cstate(struct work_struct *dummy)
 	u64 val64;
 
 	msr_now = pkg_state_counter();
-	tsc_now = native_read_tsc();
+	tsc_now = rdtsc();
 	jiffies_now = jiffies;
 
 	/* calculate pkg cstate vs tsc ratio */
diff --git a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
index f02b0c0bff9b..6ff8383f2941 100644
--- a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
+++ b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
@@ -81,11 +81,11 @@ static int __init cpufreq_test_tsc(void)
 
 	printk(KERN_DEBUG "start--> \n");
 	then = read_pmtmr();
-	then_tsc = native_read_tsc();
+	then_tsc = rdtsc();
 	for (i=0;i<20;i++) {
 		mdelay(100);
 		now = read_pmtmr();
-		now_tsc = native_read_tsc();
+		now_tsc = rdtsc();
 		diff = (now - then) & 0xFFFFFF;
 		diff_tsc = now_tsc - then_tsc;
 		printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc);
-- 
cgit v1.2.3-70-g09d2


From 03b9730b769fc4d87e40f6104f4c5b2e43889f19 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:08 +0200
Subject: x86/asm/tsc: Add rdtsc_ordered() and use it in trivial call sites

rdtsc_barrier(); rdtsc() is an unnecessary mouthful and requires
more thought than should be necessary. Add an rdtsc_ordered()
helper and replace the trivial call sites with it.

This should not change generated code. The duplication of the
fence asm is temporary.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/dddbf98a2af53312e9aa73a5a2b1622fe5d6f52b.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/vdso/vclock_gettime.c | 16 ++--------------
 arch/x86/include/asm/msr.h           | 26 ++++++++++++++++++++++++++
 arch/x86/kernel/trace_clock.c        |  7 +------
 arch/x86/kvm/x86.c                   | 16 ++--------------
 arch/x86/lib/delay.c                 |  9 +++------
 5 files changed, 34 insertions(+), 40 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 0340d93c18ca..ca94fa649251 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode)
 
 notrace static cycle_t vread_tsc(void)
 {
-	cycle_t ret;
-	u64 last;
-
-	/*
-	 * Empirically, a fence (of type that depends on the CPU)
-	 * before rdtsc is enough to ensure that rdtsc is ordered
-	 * with respect to loads.  The various CPU manuals are unclear
-	 * as to whether rdtsc can be reordered with later loads,
-	 * but no one has ever seen it happen.
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)rdtsc();
-
-	last = gtod->cycle_last;
+	cycle_t ret = (cycle_t)rdtsc_ordered();
+	u64 last = gtod->cycle_last;
 
 	if (likely(ret >= last))
 		return ret;
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index ff0c120dafe5..02bdd6c65017 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -127,6 +127,32 @@ static __always_inline unsigned long long rdtsc(void)
 	return EAX_EDX_VAL(val, low, high);
 }
 
+/**
+ * rdtsc_ordered() - read the current TSC in program order
+ *
+ * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
+ * It is ordered like a load to a global in-memory counter.  It should
+ * be impossible to observe non-monotonic rdtsc_unordered() behavior
+ * across multiple CPUs as long as the TSC is synced.
+ */
+static __always_inline unsigned long long rdtsc_ordered(void)
+{
+	/*
+	 * The RDTSC instruction is not ordered relative to memory
+	 * access.  The Intel SDM and the AMD APM are both vague on this
+	 * point, but empirically an RDTSC instruction can be
+	 * speculatively executed before prior loads.  An RDTSC
+	 * immediately after an appropriate barrier appears to be
+	 * ordered as a normal load, that is, it provides the same
+	 * ordering guarantees as reading from a global memory location
+	 * that some other imaginary CPU is updating continuously with a
+	 * time stamp.
+	 */
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+			  "lfence", X86_FEATURE_LFENCE_RDTSC);
+	return rdtsc();
+}
+
 static inline unsigned long long native_read_pmc(int counter)
 {
 	DECLARE_ARGS(val, low, high);
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index 67efb8c96fc4..80bb24d9b880 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -12,10 +12,5 @@
  */
 u64 notrace trace_clock_x86_tsc(void)
 {
-	u64 ret;
-
-	rdtsc_barrier();
-	ret = rdtsc();
-
-	return ret;
+	return rdtsc_ordered();
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dfa97139282d..8d73ec8a2364 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1444,20 +1444,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 static cycle_t read_tsc(void)
 {
-	cycle_t ret;
-	u64 last;
-
-	/*
-	 * Empirically, a fence (of type that depends on the CPU)
-	 * before rdtsc is enough to ensure that rdtsc is ordered
-	 * with respect to loads.  The various CPU manuals are unclear
-	 * as to whether rdtsc can be reordered with later loads,
-	 * but no one has ever seen it happen.
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)rdtsc();
-
-	last = pvclock_gtod_data.clock.cycle_last;
+	cycle_t ret = (cycle_t)rdtsc_ordered();
+	u64 last = pvclock_gtod_data.clock.cycle_last;
 
 	if (likely(ret >= last))
 		return ret;
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index f24bc59ab0a0..4453d52a143d 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -54,11 +54,9 @@ static void delay_tsc(unsigned long __loops)
 
 	preempt_disable();
 	cpu = smp_processor_id();
-	rdtsc_barrier();
-	bclock = rdtsc();
+	bclock = rdtsc_ordered();
 	for (;;) {
-		rdtsc_barrier();
-		now = rdtsc();
+		now = rdtsc_ordered();
 		if ((now - bclock) >= loops)
 			break;
 
@@ -79,8 +77,7 @@ static void delay_tsc(unsigned long __loops)
 		if (unlikely(cpu != smp_processor_id())) {
 			loops -= (now - bclock);
 			cpu = smp_processor_id();
-			rdtsc_barrier();
-			bclock = rdtsc();
+			bclock = rdtsc_ordered();
 		}
 	}
 	preempt_enable();
-- 
cgit v1.2.3-70-g09d2


From eee6946e44510b61c35cf754f5505537c7a8eb77 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:09 +0200
Subject: x86/asm/tsc/sync: Use rdtsc_ordered() in check_tsc_warp() and drop
 extra barriers

Using get_cycles was unnecessary: check_tsc_warp() is not called
on TSC-less systems. Replace rdtsc_barrier(); get_cycles() with
rdtsc_ordered().

While we're at it, make the somewhat more dangerous change of
removing barrier_before_rdtsc after RDTSC in the TSC warp check
code. This should be okay, though -- the vDSO TSC code doesn't
have that barrier, so, if removing the barrier from the warp
check would cause us to detect a warp that we otherwise wouldn't
detect, then we have a genuine bug.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/387c4c3a75f875bcde6cd68cee013273a744f364.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc_sync.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index dd8d0791dfb5..78083bf23ed1 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -39,16 +39,15 @@ static cycles_t max_warp;
 static int nr_warps;
 
 /*
- * TSC-warp measurement loop running on both CPUs:
+ * TSC-warp measurement loop running on both CPUs.  This is not called
+ * if there is no TSC.
  */
 static void check_tsc_warp(unsigned int timeout)
 {
 	cycles_t start, now, prev, end;
 	int i;
 
-	rdtsc_barrier();
-	start = get_cycles();
-	rdtsc_barrier();
+	start = rdtsc_ordered();
 	/*
 	 * The measurement runs for 'timeout' msecs:
 	 */
@@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout)
 		 */
 		arch_spin_lock(&sync_lock);
 		prev = last_tsc;
-		rdtsc_barrier();
-		now = get_cycles();
-		rdtsc_barrier();
+		now = rdtsc_ordered();
 		last_tsc = now;
 		arch_spin_unlock(&sync_lock);
 
@@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu)
 
 	/*
 	 * No need to check if we already know that the TSC is not
-	 * synchronized:
+	 * synchronized or if we have no TSC.
 	 */
 	if (unsynchronized_tsc())
 		return;
@@ -190,6 +187,7 @@ void check_tsc_sync_target(void)
 {
 	int cpus = 2;
 
+	/* Also aborts if there is no TSC. */
 	if (unsynchronized_tsc() || tsc_clocksource_reliable)
 		return;
 
-- 
cgit v1.2.3-70-g09d2


From 27c634054a3155e1d9a02f0e362e4f4ff8d28ee7 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:10 +0200
Subject: x86/asm/tsc: Use rdtsc_ordered() in read_tsc() instead of
 get_cycles()

There are two logical changes here.  First, this removes a check
for cpu_has_tsc.  That check is unnecessary, as we don't
register the TSC as a clocksource on systems that have no TSC.

Second, it adds a barrier, thus preventing observable
non-monotonicity.

I suspect that the missing barrier was never a problem in
practice because system calls themselves were heavy enough
barriers to prevent user code from observing time warps due to
speculation. (Without the corresponding barrier in the vDSO,
however, non-monotonicity is easy to detect.)

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/c6ff621a053127a65b70f175443578db7a0711be.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 21d6e04e3e82..451bade0d320 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -961,7 +961,7 @@ static struct clocksource clocksource_tsc;
  */
 static cycle_t read_tsc(struct clocksource *cs)
 {
-	return (cycle_t)get_cycles();
+	return (cycle_t)rdtsc_ordered();
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From c0bfd26e136cafc2b23c16225b4d7b1e14de81c1 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:10 -0400
Subject: x86/compat: Move copy_siginfo_*_user32() to signal_compat.c

copy_siginfo_to_user32() and copy_siginfo_from_user32() are used
by both the 32-bit compat and x32 ABIs.  Move them to
signal_compat.c.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-2-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32_signal.c     | 93 ----------------------------------------
 arch/x86/kernel/Makefile        |  1 +
 arch/x86/kernel/signal_compat.c | 95 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 93 deletions(-)
 create mode 100644 arch/x86/kernel/signal_compat.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index ae3a29ae875b..a0a19b7ba22d 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -34,99 +34,6 @@
 #include <asm/sys_ia32.h>
 #include <asm/smap.h>
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
-{
-	int err = 0;
-	bool ia32 = test_thread_flag(TIF_IA32);
-
-	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
-		return -EFAULT;
-
-	put_user_try {
-		/* If you change siginfo_t structure, please make sure that
-		   this code is fixed accordingly.
-		   It should never copy any pad contained in the structure
-		   to avoid security leaks, but must copy the generic
-		   3 ints plus the relevant union member.  */
-		put_user_ex(from->si_signo, &to->si_signo);
-		put_user_ex(from->si_errno, &to->si_errno);
-		put_user_ex((short)from->si_code, &to->si_code);
-
-		if (from->si_code < 0) {
-			put_user_ex(from->si_pid, &to->si_pid);
-			put_user_ex(from->si_uid, &to->si_uid);
-			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
-		} else {
-			/*
-			 * First 32bits of unions are always present:
-			 * si_pid === si_band === si_tid === si_addr(LS half)
-			 */
-			put_user_ex(from->_sifields._pad[0],
-					  &to->_sifields._pad[0]);
-			switch (from->si_code >> 16) {
-			case __SI_FAULT >> 16:
-				break;
-			case __SI_SYS >> 16:
-				put_user_ex(from->si_syscall, &to->si_syscall);
-				put_user_ex(from->si_arch, &to->si_arch);
-				break;
-			case __SI_CHLD >> 16:
-				if (ia32) {
-					put_user_ex(from->si_utime, &to->si_utime);
-					put_user_ex(from->si_stime, &to->si_stime);
-				} else {
-					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
-					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
-				}
-				put_user_ex(from->si_status, &to->si_status);
-				/* FALL THROUGH */
-			default:
-			case __SI_KILL >> 16:
-				put_user_ex(from->si_uid, &to->si_uid);
-				break;
-			case __SI_POLL >> 16:
-				put_user_ex(from->si_fd, &to->si_fd);
-				break;
-			case __SI_TIMER >> 16:
-				put_user_ex(from->si_overrun, &to->si_overrun);
-				put_user_ex(ptr_to_compat(from->si_ptr),
-					    &to->si_ptr);
-				break;
-				 /* This is not generated by the kernel as of now.  */
-			case __SI_RT >> 16:
-			case __SI_MESGQ >> 16:
-				put_user_ex(from->si_uid, &to->si_uid);
-				put_user_ex(from->si_int, &to->si_int);
-				break;
-			}
-		}
-	} put_user_catch(err);
-
-	return err;
-}
-
-int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
-{
-	int err = 0;
-	u32 ptr32;
-
-	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
-		return -EFAULT;
-
-	get_user_try {
-		get_user_ex(to->si_signo, &from->si_signo);
-		get_user_ex(to->si_errno, &from->si_errno);
-		get_user_ex(to->si_code, &from->si_code);
-
-		get_user_ex(to->si_pid, &from->si_pid);
-		get_user_ex(to->si_uid, &from->si_uid);
-		get_user_ex(ptr32, &from->si_ptr);
-		to->si_ptr = compat_ptr(ptr32);
-	} get_user_catch(err);
-
-	return err;
-}
-
 /*
  * Do a signal return; undo the signal stack.
  */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f15af41bd80..dc19730ad0db 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,6 +23,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o
+obj-$(CONFIG_COMPAT)	+= signal_compat.o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o nmi.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
new file mode 100644
index 000000000000..dc3c0b1c816f
--- /dev/null
+++ b/arch/x86/kernel/signal_compat.c
@@ -0,0 +1,95 @@
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+{
+	int err = 0;
+	bool ia32 = test_thread_flag(TIF_IA32);
+
+	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	put_user_try {
+		/* If you change siginfo_t structure, please make sure that
+		   this code is fixed accordingly.
+		   It should never copy any pad contained in the structure
+		   to avoid security leaks, but must copy the generic
+		   3 ints plus the relevant union member.  */
+		put_user_ex(from->si_signo, &to->si_signo);
+		put_user_ex(from->si_errno, &to->si_errno);
+		put_user_ex((short)from->si_code, &to->si_code);
+
+		if (from->si_code < 0) {
+			put_user_ex(from->si_pid, &to->si_pid);
+			put_user_ex(from->si_uid, &to->si_uid);
+			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
+		} else {
+			/*
+			 * First 32bits of unions are always present:
+			 * si_pid === si_band === si_tid === si_addr(LS half)
+			 */
+			put_user_ex(from->_sifields._pad[0],
+					  &to->_sifields._pad[0]);
+			switch (from->si_code >> 16) {
+			case __SI_FAULT >> 16:
+				break;
+			case __SI_SYS >> 16:
+				put_user_ex(from->si_syscall, &to->si_syscall);
+				put_user_ex(from->si_arch, &to->si_arch);
+				break;
+			case __SI_CHLD >> 16:
+				if (ia32) {
+					put_user_ex(from->si_utime, &to->si_utime);
+					put_user_ex(from->si_stime, &to->si_stime);
+				} else {
+					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
+					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
+				}
+				put_user_ex(from->si_status, &to->si_status);
+				/* FALL THROUGH */
+			default:
+			case __SI_KILL >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				break;
+			case __SI_POLL >> 16:
+				put_user_ex(from->si_fd, &to->si_fd);
+				break;
+			case __SI_TIMER >> 16:
+				put_user_ex(from->si_overrun, &to->si_overrun);
+				put_user_ex(ptr_to_compat(from->si_ptr),
+					    &to->si_ptr);
+				break;
+				 /* This is not generated by the kernel as of now.  */
+			case __SI_RT >> 16:
+			case __SI_MESGQ >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				put_user_ex(from->si_int, &to->si_int);
+				break;
+			}
+		}
+	} put_user_catch(err);
+
+	return err;
+}
+
+int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
+{
+	int err = 0;
+	u32 ptr32;
+
+	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	get_user_try {
+		get_user_ex(to->si_signo, &from->si_signo);
+		get_user_ex(to->si_errno, &from->si_errno);
+		get_user_ex(to->si_code, &from->si_code);
+
+		get_user_ex(to->si_pid, &from->si_pid);
+		get_user_ex(to->si_uid, &from->si_uid);
+		get_user_ex(ptr32, &from->si_ptr);
+		to->si_ptr = compat_ptr(ptr32);
+	} get_user_catch(err);
+
+	return err;
+}
-- 
cgit v1.2.3-70-g09d2


From 7da770785f9740af1cb24b8fd63075543bd00711 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:13 -0400
Subject: x86/compat: Rename 'start_thread_ia32' to 'compat_start_thread'

This function is shared between the 32-bit compat and x32 ABIs.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-5-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/elf.h   | 4 ++--
 arch/x86/kernel/process_64.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 180b6fe8aed3..2bf67c0e9339 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t,
 #define	COMPAT_ELF_PLAT_INIT(regs, load_addr)		\
 	elf_common_init(&current->thread, regs, __USER_DS)
 
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
-#define compat_start_thread start_thread_ia32
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp);
+#define compat_start_thread compat_start_thread
 
 void set_personality_ia32(bool);
 #define COMPAT_SET_PERSONALITY(ex)			\
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 71d7849a07f7..0831ba3bcf95 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -248,8 +248,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 			    __USER_CS, __USER_DS, 0);
 }
 
-#ifdef CONFIG_IA32_EMULATION
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 {
 	start_thread_common(regs, new_ip, new_sp,
 			    test_thread_flag(TIF_X32)
-- 
cgit v1.2.3-70-g09d2


From 601275c3e04c43b3b34237ab36c27fc1cfb8a189 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:14 -0400
Subject: x86/compat: Factor out ia32 compat code from compat_arch_ptrace()

Move the ia32-specific code in compat_arch_ptrace() into its
own function.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-6-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/ptrace.c | 138 +++++++++++++++++++++++++----------------------
 1 file changed, 74 insertions(+), 64 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 9be72bc3613f..7155957b3c25 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1123,6 +1123,73 @@ static int genregs32_set(struct task_struct *target,
 	return ret;
 }
 
+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+			     compat_ulong_t caddr, compat_ulong_t cdata)
+{
+	unsigned long addr = caddr;
+	unsigned long data = cdata;
+	void __user *datap = compat_ptr(data);
+	int ret;
+	__u32 val;
+
+	switch (request) {
+	case PTRACE_PEEKUSR:
+		ret = getreg32(child, addr, &val);
+		if (ret == 0)
+			ret = put_user(val, (__u32 __user *)datap);
+		break;
+
+	case PTRACE_POKEUSR:
+		ret = putreg32(child, addr, data);
+		break;
+
+	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_GENERAL,
+					   0, sizeof(struct user_regs_struct32),
+					   datap);
+
+	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_GENERAL, 0,
+					     sizeof(struct user_regs_struct32),
+					     datap);
+
+	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_FP, 0,
+					   sizeof(struct user_i387_ia32_struct),
+					   datap);
+
+	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
+		return copy_regset_from_user(
+			child, &user_x86_32_view, REGSET_FP,
+			0, sizeof(struct user_i387_ia32_struct), datap);
+
+	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_XFP, 0,
+					   sizeof(struct user32_fxsr_struct),
+					   datap);
+
+	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_XFP, 0,
+					     sizeof(struct user32_fxsr_struct),
+					     datap);
+
+	case PTRACE_GET_THREAD_AREA:
+	case PTRACE_SET_THREAD_AREA:
+		return arch_ptrace(child, request, addr, data);
+
+	default:
+		return compat_ptrace_request(child, request, addr, data);
+	}
+
+	return ret;
+}
+#endif /* CONFIG_IA32_EMULATION */
+
 #ifdef CONFIG_X86_X32_ABI
 static long x32_arch_ptrace(struct task_struct *child,
 			    compat_long_t request, compat_ulong_t caddr,
@@ -1211,78 +1278,21 @@ static long x32_arch_ptrace(struct task_struct *child,
 }
 #endif
 
+#ifdef CONFIG_COMPAT
 long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			compat_ulong_t caddr, compat_ulong_t cdata)
 {
-	unsigned long addr = caddr;
-	unsigned long data = cdata;
-	void __user *datap = compat_ptr(data);
-	int ret;
-	__u32 val;
-
 #ifdef CONFIG_X86_X32_ABI
 	if (!is_ia32_task())
 		return x32_arch_ptrace(child, request, caddr, cdata);
 #endif
-
-	switch (request) {
-	case PTRACE_PEEKUSR:
-		ret = getreg32(child, addr, &val);
-		if (ret == 0)
-			ret = put_user(val, (__u32 __user *)datap);
-		break;
-
-	case PTRACE_POKEUSR:
-		ret = putreg32(child, addr, data);
-		break;
-
-	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_GENERAL,
-					   0, sizeof(struct user_regs_struct32),
-					   datap);
-
-	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_GENERAL, 0,
-					     sizeof(struct user_regs_struct32),
-					     datap);
-
-	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_FP, 0,
-					   sizeof(struct user_i387_ia32_struct),
-					   datap);
-
-	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
-		return copy_regset_from_user(
-			child, &user_x86_32_view, REGSET_FP,
-			0, sizeof(struct user_i387_ia32_struct), datap);
-
-	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_XFP, 0,
-					   sizeof(struct user32_fxsr_struct),
-					   datap);
-
-	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_XFP, 0,
-					     sizeof(struct user32_fxsr_struct),
-					     datap);
-
-	case PTRACE_GET_THREAD_AREA:
-	case PTRACE_SET_THREAD_AREA:
-		return arch_ptrace(child, request, addr, data);
-
-	default:
-		return compat_ptrace_request(child, request, addr, data);
-	}
-
-	return ret;
+#ifdef CONFIG_IA32_EMULATION
+	return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+	return 0;
+#endif
 }
-
-#endif	/* CONFIG_IA32_EMULATION */
+#endif	/* CONFIG_COMPAT */
 
 #ifdef CONFIG_X86_64
 
-- 
cgit v1.2.3-70-g09d2


From 10ed34935e7e828ce4ce566647a2d6b8240e4dee Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:17 -0400
Subject: x86/compat, x86/perf: Don't build perf_callchain_user32() on x32

perf_callchain_user32() is not needed for x32.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-9-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3658de47900f..641413d68a54 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2196,7 +2196,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	return get_desc_base(desc + idx);
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_IA32_EMULATION
 
 #include <asm/compat.h>
 
-- 
cgit v1.2.3-70-g09d2


From 5e2aad2460bd38d0777052486893b32902efcdcd Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:18 -0400
Subject: x86/compat: Remove unneeded #include

Including sys_ia32.h is not needed in signal.c.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-10-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/signal.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 206996c1669d..6c22aad8b909 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -35,7 +35,6 @@
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
-#include <asm/sys_ia32.h>
 #endif /* CONFIG_X86_64 */
 
 #include <asm/syscall.h>
-- 
cgit v1.2.3-70-g09d2


From 68872eb9b19bbd85883262a4e0927b487653816c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 6 Jul 2015 17:29:00 +0300
Subject: x86/platform/intel/pmc_atom: Export accessors to PMC registers

Export the pmc_atom_read() and pmc_atom_write() accessors to the PMC
registers. On early initcall stages the functions will return
-ENODEV, and caller has to wait when it will be available.

Additionally make absence of debugfs a non-fatal error.

The patch will be useful for the upcoming fixes regarding to the
LPSS block found on Intel BayTrail-T and Braswell.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Kumar P Mahesh <mahesh.kumar.p@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1436192944-56496-2-git-send-email-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pmc_atom.h |  4 ++++
 arch/x86/kernel/pmc_atom.c      | 49 ++++++++++++++++++++++++++++-------------
 2 files changed, 38 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
index bc0fc0866553..6ee220089f5d 100644
--- a/arch/x86/include/asm/pmc_atom.h
+++ b/arch/x86/include/asm/pmc_atom.h
@@ -126,4 +126,8 @@
 #define	SLEEP_TYPE_MASK		0xFFFFECFF
 #define	SLEEP_TYPE_S5		0x1C00
 #define	SLEEP_ENABLE		0x2000
+
+extern int pmc_atom_read(int offset, u32 *value);
+extern int pmc_atom_write(int offset, u32 value);
+
 #endif /* PMC_ATOM_H */
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
index d66a4fe6caee..4e1242ef45b0 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/kernel/pmc_atom.c
@@ -31,6 +31,7 @@ struct pmc_dev {
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dbgfs_dir;
 #endif /* CONFIG_DEBUG_FS */
+	bool init;
 };
 
 static struct pmc_dev pmc_device;
@@ -111,6 +112,30 @@ static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
 	writel(val, pmc->regmap + reg_offset);
 }
 
+int pmc_atom_read(int offset, u32 *value)
+{
+	struct pmc_dev *pmc = &pmc_device;
+
+	if (!pmc->init)
+		return -ENODEV;
+
+	*value = pmc_reg_read(pmc, offset);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_read);
+
+int pmc_atom_write(int offset, u32 value)
+{
+	struct pmc_dev *pmc = &pmc_device;
+
+	if (!pmc->init)
+		return -ENODEV;
+
+	pmc_reg_write(pmc, offset, value);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_write);
+
 static void pmc_power_off(void)
 {
 	u16	pm1_cnt_port;
@@ -250,7 +275,7 @@ static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
 	debugfs_remove_recursive(pmc->dbgfs_dir);
 }
 
-static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
 {
 	struct dentry *dir, *f;
 
@@ -262,24 +287,18 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
 
 	f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
 				dir, pmc, &pmc_dev_state_ops);
-	if (!f) {
-		dev_err(&pdev->dev, "dev_state register failed\n");
+	if (!f)
 		goto err;
-	}
 
 	f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
 				dir, pmc, &pmc_pss_state_ops);
-	if (!f) {
-		dev_err(&pdev->dev, "pss_state register failed\n");
+	if (!f)
 		goto err;
-	}
 
 	f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
 				dir, pmc, &pmc_sleep_tmr_ops);
-	if (!f) {
-		dev_err(&pdev->dev, "sleep_state register failed\n");
+	if (!f)
 		goto err;
-	}
 
 	return 0;
 err:
@@ -287,7 +306,7 @@ err:
 	return -ENODEV;
 }
 #else
-static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
 {
 	return 0;
 }
@@ -318,11 +337,11 @@ static int pmc_setup_dev(struct pci_dev *pdev)
 	/* PMC hardware registers setup */
 	pmc_hw_reg_setup(pmc);
 
-	ret = pmc_dbgfs_register(pmc, pdev);
-	if (ret) {
-		iounmap(pmc->regmap);
-	}
+	ret = pmc_dbgfs_register(pmc);
+	if (ret)
+		dev_warn(&pdev->dev, "debugfs register failed\n");
 
+	pmc->init = true;
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From c3c65aa6d43f9e9f23f688848b08ffec97be893b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 6 Jul 2015 17:29:01 +0300
Subject: x86/platform/intel/pmc_atom: Print index of device in loop

The register mapping may change from one platform to another.
Thus, indices might be not the same on different platforms. The
patch makes the code to print the device index dynamically at
run time.

The patch also changes the for loop to iterate over the map
until a terminator is found.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Kumar P Mahesh <mahesh.kumar.p@intel.com>
Link: http://lkml.kernel.org/r/1436192944-56496-3-git-send-email-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/pmc_atom.c | 126 ++++++++++++++++++++++-----------------------
 1 file changed, 63 insertions(+), 63 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
index 4e1242ef45b0..c7dca07d121c 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/kernel/pmc_atom.c
@@ -43,63 +43,65 @@ struct pmc_bit_map {
 };
 
 static const struct pmc_bit_map dev_map[] = {
-	{"0  - LPSS1_F0_DMA",		BIT_LPSS1_F0_DMA},
-	{"1  - LPSS1_F1_PWM1",		BIT_LPSS1_F1_PWM1},
-	{"2  - LPSS1_F2_PWM2",		BIT_LPSS1_F2_PWM2},
-	{"3  - LPSS1_F3_HSUART1",	BIT_LPSS1_F3_HSUART1},
-	{"4  - LPSS1_F4_HSUART2",	BIT_LPSS1_F4_HSUART2},
-	{"5  - LPSS1_F5_SPI",		BIT_LPSS1_F5_SPI},
-	{"6  - LPSS1_F6_Reserved",	BIT_LPSS1_F6_XXX},
-	{"7  - LPSS1_F7_Reserved",	BIT_LPSS1_F7_XXX},
-	{"8  - SCC_EMMC",		BIT_SCC_EMMC},
-	{"9  - SCC_SDIO",		BIT_SCC_SDIO},
-	{"10 - SCC_SDCARD",		BIT_SCC_SDCARD},
-	{"11 - SCC_MIPI",		BIT_SCC_MIPI},
-	{"12 - HDA",			BIT_HDA},
-	{"13 - LPE",			BIT_LPE},
-	{"14 - OTG",			BIT_OTG},
-	{"15 - USH",			BIT_USH},
-	{"16 - GBE",			BIT_GBE},
-	{"17 - SATA",			BIT_SATA},
-	{"18 - USB_EHCI",		BIT_USB_EHCI},
-	{"19 - SEC",			BIT_SEC},
-	{"20 - PCIE_PORT0",		BIT_PCIE_PORT0},
-	{"21 - PCIE_PORT1",		BIT_PCIE_PORT1},
-	{"22 - PCIE_PORT2",		BIT_PCIE_PORT2},
-	{"23 - PCIE_PORT3",		BIT_PCIE_PORT3},
-	{"24 - LPSS2_F0_DMA",		BIT_LPSS2_F0_DMA},
-	{"25 - LPSS2_F1_I2C1",		BIT_LPSS2_F1_I2C1},
-	{"26 - LPSS2_F2_I2C2",		BIT_LPSS2_F2_I2C2},
-	{"27 - LPSS2_F3_I2C3",		BIT_LPSS2_F3_I2C3},
-	{"28 - LPSS2_F3_I2C4",		BIT_LPSS2_F4_I2C4},
-	{"29 - LPSS2_F5_I2C5",		BIT_LPSS2_F5_I2C5},
-	{"30 - LPSS2_F6_I2C6",		BIT_LPSS2_F6_I2C6},
-	{"31 - LPSS2_F7_I2C7",		BIT_LPSS2_F7_I2C7},
-	{"32 - SMB",			BIT_SMB},
-	{"33 - OTG_SS_PHY",		BIT_OTG_SS_PHY},
-	{"34 - USH_SS_PHY",		BIT_USH_SS_PHY},
-	{"35 - DFX",			BIT_DFX},
+	{"LPSS1_F0_DMA",	BIT_LPSS1_F0_DMA},
+	{"LPSS1_F1_PWM1",	BIT_LPSS1_F1_PWM1},
+	{"LPSS1_F2_PWM2",	BIT_LPSS1_F2_PWM2},
+	{"LPSS1_F3_HSUART1",	BIT_LPSS1_F3_HSUART1},
+	{"LPSS1_F4_HSUART2",	BIT_LPSS1_F4_HSUART2},
+	{"LPSS1_F5_SPI",	BIT_LPSS1_F5_SPI},
+	{"LPSS1_F6_Reserved",	BIT_LPSS1_F6_XXX},
+	{"LPSS1_F7_Reserved",	BIT_LPSS1_F7_XXX},
+	{"SCC_EMMC",		BIT_SCC_EMMC},
+	{"SCC_SDIO",		BIT_SCC_SDIO},
+	{"SCC_SDCARD",		BIT_SCC_SDCARD},
+	{"SCC_MIPI",		BIT_SCC_MIPI},
+	{"HDA",			BIT_HDA},
+	{"LPE",			BIT_LPE},
+	{"OTG",			BIT_OTG},
+	{"USH",			BIT_USH},
+	{"GBE",			BIT_GBE},
+	{"SATA",		BIT_SATA},
+	{"USB_EHCI",		BIT_USB_EHCI},
+	{"SEC",			BIT_SEC},
+	{"PCIE_PORT0",		BIT_PCIE_PORT0},
+	{"PCIE_PORT1",		BIT_PCIE_PORT1},
+	{"PCIE_PORT2",		BIT_PCIE_PORT2},
+	{"PCIE_PORT3",		BIT_PCIE_PORT3},
+	{"LPSS2_F0_DMA",	BIT_LPSS2_F0_DMA},
+	{"LPSS2_F1_I2C1",	BIT_LPSS2_F1_I2C1},
+	{"LPSS2_F2_I2C2",	BIT_LPSS2_F2_I2C2},
+	{"LPSS2_F3_I2C3",	BIT_LPSS2_F3_I2C3},
+	{"LPSS2_F3_I2C4",	BIT_LPSS2_F4_I2C4},
+	{"LPSS2_F5_I2C5",	BIT_LPSS2_F5_I2C5},
+	{"LPSS2_F6_I2C6",	BIT_LPSS2_F6_I2C6},
+	{"LPSS2_F7_I2C7",	BIT_LPSS2_F7_I2C7},
+	{"SMB",			BIT_SMB},
+	{"OTG_SS_PHY",		BIT_OTG_SS_PHY},
+	{"USH_SS_PHY",		BIT_USH_SS_PHY},
+	{"DFX",			BIT_DFX},
+	{},
 };
 
 static const struct pmc_bit_map pss_map[] = {
-	{"0  - GBE",			PMC_PSS_BIT_GBE},
-	{"1  - SATA",			PMC_PSS_BIT_SATA},
-	{"2  - HDA",			PMC_PSS_BIT_HDA},
-	{"3  - SEC",			PMC_PSS_BIT_SEC},
-	{"4  - PCIE",			PMC_PSS_BIT_PCIE},
-	{"5  - LPSS",			PMC_PSS_BIT_LPSS},
-	{"6  - LPE",			PMC_PSS_BIT_LPE},
-	{"7  - DFX",			PMC_PSS_BIT_DFX},
-	{"8  - USH_CTRL",		PMC_PSS_BIT_USH_CTRL},
-	{"9  - USH_SUS",		PMC_PSS_BIT_USH_SUS},
-	{"10 - USH_VCCS",		PMC_PSS_BIT_USH_VCCS},
-	{"11 - USH_VCCA",		PMC_PSS_BIT_USH_VCCA},
-	{"12 - OTG_CTRL",		PMC_PSS_BIT_OTG_CTRL},
-	{"13 - OTG_VCCS",		PMC_PSS_BIT_OTG_VCCS},
-	{"14 - OTG_VCCA_CLK",		PMC_PSS_BIT_OTG_VCCA_CLK},
-	{"15 - OTG_VCCA",		PMC_PSS_BIT_OTG_VCCA},
-	{"16 - USB",			PMC_PSS_BIT_USB},
-	{"17 - USB_SUS",		PMC_PSS_BIT_USB_SUS},
+	{"GBE",			PMC_PSS_BIT_GBE},
+	{"SATA",		PMC_PSS_BIT_SATA},
+	{"HDA",			PMC_PSS_BIT_HDA},
+	{"SEC",			PMC_PSS_BIT_SEC},
+	{"PCIE",		PMC_PSS_BIT_PCIE},
+	{"LPSS",		PMC_PSS_BIT_LPSS},
+	{"LPE",			PMC_PSS_BIT_LPE},
+	{"DFX",			PMC_PSS_BIT_DFX},
+	{"USH_CTRL",		PMC_PSS_BIT_USH_CTRL},
+	{"USH_SUS",		PMC_PSS_BIT_USH_SUS},
+	{"USH_VCCS",		PMC_PSS_BIT_USH_VCCS},
+	{"USH_VCCA",		PMC_PSS_BIT_USH_VCCA},
+	{"OTG_CTRL",		PMC_PSS_BIT_OTG_CTRL},
+	{"OTG_VCCS",		PMC_PSS_BIT_OTG_VCCS},
+	{"OTG_VCCA_CLK",	PMC_PSS_BIT_OTG_VCCA_CLK},
+	{"OTG_VCCA",		PMC_PSS_BIT_OTG_VCCA},
+	{"USB",			PMC_PSS_BIT_USB},
+	{"USB_SUS",		PMC_PSS_BIT_USB_SUS},
+	{},
 };
 
 static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
@@ -172,16 +174,14 @@ static int pmc_dev_state_show(struct seq_file *s, void *unused)
 	struct pmc_dev *pmc = s->private;
 	u32 func_dis, func_dis_2, func_dis_index;
 	u32 d3_sts_0, d3_sts_1, d3_sts_index;
-	int dev_num, dev_index, reg_index;
+	int dev_index, reg_index;
 
 	func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
 	func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
 	d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
 	d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
 
-	dev_num = ARRAY_SIZE(dev_map);
-
-	for (dev_index = 0; dev_index < dev_num; dev_index++) {
+	for (dev_index = 0; dev_map[dev_index].name; dev_index++) {
 		reg_index = dev_index / PMC_REG_BIT_WIDTH;
 		if (reg_index) {
 			func_dis_index = func_dis_2;
@@ -191,8 +191,8 @@ static int pmc_dev_state_show(struct seq_file *s, void *unused)
 			d3_sts_index = d3_sts_0;
 		}
 
-		seq_printf(s, "Dev: %-32s\tState: %s [%s]\n",
-			dev_map[dev_index].name,
+		seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
+			dev_index, dev_map[dev_index].name,
 			dev_map[dev_index].bit_mask & func_dis_index ?
 			"Disabled" : "Enabled ",
 			dev_map[dev_index].bit_mask & d3_sts_index ?
@@ -219,9 +219,9 @@ static int pmc_pss_state_show(struct seq_file *s, void *unused)
 	u32 pss = pmc_reg_read(pmc, PMC_PSS);
 	int pss_index;
 
-	for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) {
-		seq_printf(s, "Island: %-32s\tState: %s\n",
-			pss_map[pss_index].name,
+	for (pss_index = 0; pss_map[pss_index].name; pss_index++) {
+		seq_printf(s, "Island: %-2d - %-32s\tState: %s\n",
+			pss_index, pss_map[pss_index].name,
 			pss_map[pss_index].bit_mask & pss ? "Off" : "On");
 	}
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 940406d1cfb5b35cb9716d186fe3e6308f2700c5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 6 Jul 2015 17:29:02 +0300
Subject: x86/platform/intel/pmc_atom: Supply register mappings via PMC object

The patch converts the functions to use the register mappings
provided by PMC object. It would help in case of mappings on
different platforms.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Kumar P Mahesh <mahesh.kumar.p@intel.com>
Link: http://lkml.kernel.org/r/1436192944-56496-4-git-send-email-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/pmc_atom.c | 49 ++++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
index c7dca07d121c..4752b1a62910 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/kernel/pmc_atom.c
@@ -25,9 +25,20 @@
 
 #include <asm/pmc_atom.h>
 
+struct pmc_bit_map {
+	const char *name;
+	u32 bit_mask;
+};
+
+struct pmc_reg_map {
+	const struct pmc_bit_map *dev;
+	const struct pmc_bit_map *pss;
+};
+
 struct pmc_dev {
 	u32 base_addr;
 	void __iomem *regmap;
+	const struct pmc_reg_map *map;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dbgfs_dir;
 #endif /* CONFIG_DEBUG_FS */
@@ -37,11 +48,6 @@ struct pmc_dev {
 static struct pmc_dev pmc_device;
 static u32 acpi_base_addr;
 
-struct pmc_bit_map {
-	const char *name;
-	u32 bit_mask;
-};
-
 static const struct pmc_bit_map dev_map[] = {
 	{"LPSS1_F0_DMA",	BIT_LPSS1_F0_DMA},
 	{"LPSS1_F1_PWM1",	BIT_LPSS1_F1_PWM1},
@@ -104,6 +110,11 @@ static const struct pmc_bit_map pss_map[] = {
 	{},
 };
 
+static const struct pmc_reg_map reg_map = {
+	.dev		= dev_map,
+	.pss		= pss_map,
+};
+
 static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
 {
 	return readl(pmc->regmap + reg_offset);
@@ -172,17 +183,18 @@ static void pmc_hw_reg_setup(struct pmc_dev *pmc)
 static int pmc_dev_state_show(struct seq_file *s, void *unused)
 {
 	struct pmc_dev *pmc = s->private;
+	const struct pmc_bit_map *map = pmc->map->dev;
 	u32 func_dis, func_dis_2, func_dis_index;
 	u32 d3_sts_0, d3_sts_1, d3_sts_index;
-	int dev_index, reg_index;
+	int index, reg_index;
 
 	func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
 	func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
 	d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
 	d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
 
-	for (dev_index = 0; dev_map[dev_index].name; dev_index++) {
-		reg_index = dev_index / PMC_REG_BIT_WIDTH;
+	for (index = 0; map[index].name; index++) {
+		reg_index = index / PMC_REG_BIT_WIDTH;
 		if (reg_index) {
 			func_dis_index = func_dis_2;
 			d3_sts_index = d3_sts_1;
@@ -192,10 +204,10 @@ static int pmc_dev_state_show(struct seq_file *s, void *unused)
 		}
 
 		seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
-			dev_index, dev_map[dev_index].name,
-			dev_map[dev_index].bit_mask & func_dis_index ?
+			index, map[index].name,
+			map[index].bit_mask & func_dis_index ?
 			"Disabled" : "Enabled ",
-			dev_map[dev_index].bit_mask & d3_sts_index ?
+			map[index].bit_mask & d3_sts_index ?
 			"D3" : "D0");
 	}
 	return 0;
@@ -216,13 +228,14 @@ static const struct file_operations pmc_dev_state_ops = {
 static int pmc_pss_state_show(struct seq_file *s, void *unused)
 {
 	struct pmc_dev *pmc = s->private;
+	const struct pmc_bit_map *map = pmc->map->pss;
 	u32 pss = pmc_reg_read(pmc, PMC_PSS);
-	int pss_index;
+	int index;
 
-	for (pss_index = 0; pss_map[pss_index].name; pss_index++) {
+	for (index = 0; map[index].name; index++) {
 		seq_printf(s, "Island: %-2d - %-32s\tState: %s\n",
-			pss_index, pss_map[pss_index].name,
-			pss_map[pss_index].bit_mask & pss ? "Off" : "On");
+			index, map[index].name,
+			map[index].bit_mask & pss ? "Off" : "On");
 	}
 	return 0;
 }
@@ -312,7 +325,7 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc)
 }
 #endif /* CONFIG_DEBUG_FS */
 
-static int pmc_setup_dev(struct pci_dev *pdev)
+static int pmc_setup_dev(struct pci_dev *pdev, const struct pmc_reg_map *map)
 {
 	struct pmc_dev *pmc = &pmc_device;
 	int ret;
@@ -334,6 +347,8 @@ static int pmc_setup_dev(struct pci_dev *pdev)
 		return -ENOMEM;
 	}
 
+	pmc->map = map;
+
 	/* PMC hardware registers setup */
 	pmc_hw_reg_setup(pmc);
 
@@ -376,7 +391,7 @@ static int __init pmc_atom_init(void)
 	for_each_pci_dev(pdev) {
 		ent = pci_match_id(pmc_pci_ids, pdev);
 		if (ent)
-			return pmc_setup_dev(pdev);
+			return pmc_setup_dev(pdev, &reg_map);
 	}
 	/* Device not found. */
 	return -ENODEV;
-- 
cgit v1.2.3-70-g09d2


From 2b8f8eddaf05c02bb4a21db5be1691e36e242c65 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 6 Jul 2015 17:29:03 +0300
Subject: x86/platform/intel/pmc_atom: Add Cherrytrail PMC interface

The patch adds CHT PMC interface. This exposes all the South IP
device power states and S0ix states for CHT. The bit map of
FUNC_DIS and D3_STS_0 registers for SoCs are consistent. The
D3_STS_1 and FUNC_DIS_2 registers, however, are not aligned.
This is fixed by splitting a common mapping on per register basis.

(Originally based on code from Kumar P Mahesh.)

Originally-from: Kumar P Mahesh <mahesh.kumar.p@intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1436192944-56496-5-git-send-email-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pmc_atom.h |  25 +++++++++
 arch/x86/kernel/pmc_atom.c      | 118 ++++++++++++++++++++++++++++++----------
 2 files changed, 114 insertions(+), 29 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
index 6ee220089f5d..aa8744c77c6d 100644
--- a/arch/x86/include/asm/pmc_atom.h
+++ b/arch/x86/include/asm/pmc_atom.h
@@ -18,6 +18,8 @@
 
 /* ValleyView Power Control Unit PCI Device ID */
 #define	PCI_DEVICE_ID_VLV_PMC	0x0F1C
+/* CherryTrail Power Control Unit PCI Device ID */
+#define	PCI_DEVICE_ID_CHT_PMC	0x229C
 
 /* PMC Memory mapped IO registers */
 #define	PMC_BASE_ADDR_OFFSET	0x44
@@ -29,6 +31,10 @@
 #define	PMC_FUNC_DIS		0x34
 #define	PMC_FUNC_DIS_2		0x38
 
+/* CHT specific bits in FUNC_DIS2 register */
+#define	BIT_FD_GMM		BIT(3)
+#define	BIT_FD_ISH		BIT(4)
+
 /* S0ix wake event control */
 #define	PMC_S0IX_WAKE_EN	0x3C
 
@@ -75,6 +81,21 @@
 #define PMC_PSS_BIT_USB			BIT(16)
 #define PMC_PSS_BIT_USB_SUS		BIT(17)
 
+/* CHT specific bits in PSS register */
+#define	PMC_PSS_BIT_CHT_UFS		BIT(7)
+#define	PMC_PSS_BIT_CHT_UXD		BIT(11)
+#define	PMC_PSS_BIT_CHT_UXD_FD		BIT(12)
+#define	PMC_PSS_BIT_CHT_UX_ENG		BIT(15)
+#define	PMC_PSS_BIT_CHT_USB_SUS		BIT(16)
+#define	PMC_PSS_BIT_CHT_GMM		BIT(17)
+#define	PMC_PSS_BIT_CHT_ISH		BIT(18)
+#define	PMC_PSS_BIT_CHT_DFX_MASTER	BIT(26)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER1	BIT(27)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER2	BIT(28)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER3	BIT(29)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER4	BIT(30)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER5	BIT(31)
+
 /* These registers reflect D3 status of functions */
 #define	PMC_D3_STS_0		0xA0
 
@@ -117,6 +138,10 @@
 #define	BIT_USH_SS_PHY		BIT(2)
 #define	BIT_DFX			BIT(3)
 
+/* CHT specific bits in PMC_D3_STS_1 register */
+#define	BIT_STS_GMM		BIT(1)
+#define	BIT_STS_ISH		BIT(2)
+
 /* PMC I/O Registers */
 #define	ACPI_BASE_ADDR_OFFSET	0x40
 #define	ACPI_BASE_ADDR_MASK	0xFFFFFE00
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
index 4752b1a62910..e814d341a703 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/kernel/pmc_atom.c
@@ -31,7 +31,10 @@ struct pmc_bit_map {
 };
 
 struct pmc_reg_map {
-	const struct pmc_bit_map *dev;
+	const struct pmc_bit_map *d3_sts_0;
+	const struct pmc_bit_map *d3_sts_1;
+	const struct pmc_bit_map *func_dis;
+	const struct pmc_bit_map *func_dis_2;
 	const struct pmc_bit_map *pss;
 };
 
@@ -48,7 +51,7 @@ struct pmc_dev {
 static struct pmc_dev pmc_device;
 static u32 acpi_base_addr;
 
-static const struct pmc_bit_map dev_map[] = {
+static const struct pmc_bit_map d3_sts_0_map[] = {
 	{"LPSS1_F0_DMA",	BIT_LPSS1_F0_DMA},
 	{"LPSS1_F1_PWM1",	BIT_LPSS1_F1_PWM1},
 	{"LPSS1_F2_PWM2",	BIT_LPSS1_F2_PWM2},
@@ -81,6 +84,10 @@ static const struct pmc_bit_map dev_map[] = {
 	{"LPSS2_F5_I2C5",	BIT_LPSS2_F5_I2C5},
 	{"LPSS2_F6_I2C6",	BIT_LPSS2_F6_I2C6},
 	{"LPSS2_F7_I2C7",	BIT_LPSS2_F7_I2C7},
+	{},
+};
+
+static struct pmc_bit_map byt_d3_sts_1_map[] = {
 	{"SMB",			BIT_SMB},
 	{"OTG_SS_PHY",		BIT_OTG_SS_PHY},
 	{"USH_SS_PHY",		BIT_USH_SS_PHY},
@@ -88,7 +95,21 @@ static const struct pmc_bit_map dev_map[] = {
 	{},
 };
 
-static const struct pmc_bit_map pss_map[] = {
+static struct pmc_bit_map cht_d3_sts_1_map[] = {
+	{"SMB",			BIT_SMB},
+	{"GMM",			BIT_STS_GMM},
+	{"ISH",			BIT_STS_ISH},
+	{},
+};
+
+static struct pmc_bit_map cht_func_dis_2_map[] = {
+	{"SMB",			BIT_SMB},
+	{"GMM",			BIT_FD_GMM},
+	{"ISH",			BIT_FD_ISH},
+	{},
+};
+
+static const struct pmc_bit_map byt_pss_map[] = {
 	{"GBE",			PMC_PSS_BIT_GBE},
 	{"SATA",		PMC_PSS_BIT_SATA},
 	{"HDA",			PMC_PSS_BIT_HDA},
@@ -110,9 +131,43 @@ static const struct pmc_bit_map pss_map[] = {
 	{},
 };
 
-static const struct pmc_reg_map reg_map = {
-	.dev		= dev_map,
-	.pss		= pss_map,
+static const struct pmc_bit_map cht_pss_map[] = {
+	{"SATA",		PMC_PSS_BIT_SATA},
+	{"HDA",			PMC_PSS_BIT_HDA},
+	{"SEC",			PMC_PSS_BIT_SEC},
+	{"PCIE",		PMC_PSS_BIT_PCIE},
+	{"LPSS",		PMC_PSS_BIT_LPSS},
+	{"LPE",			PMC_PSS_BIT_LPE},
+	{"UFS",			PMC_PSS_BIT_CHT_UFS},
+	{"UXD",			PMC_PSS_BIT_CHT_UXD},
+	{"UXD_FD",		PMC_PSS_BIT_CHT_UXD_FD},
+	{"UX_ENG",		PMC_PSS_BIT_CHT_UX_ENG},
+	{"USB_SUS",		PMC_PSS_BIT_CHT_USB_SUS},
+	{"GMM",			PMC_PSS_BIT_CHT_GMM},
+	{"ISH",			PMC_PSS_BIT_CHT_ISH},
+	{"DFX_MASTER",		PMC_PSS_BIT_CHT_DFX_MASTER},
+	{"DFX_CLUSTER1",	PMC_PSS_BIT_CHT_DFX_CLUSTER1},
+	{"DFX_CLUSTER2",	PMC_PSS_BIT_CHT_DFX_CLUSTER2},
+	{"DFX_CLUSTER3",	PMC_PSS_BIT_CHT_DFX_CLUSTER3},
+	{"DFX_CLUSTER4",	PMC_PSS_BIT_CHT_DFX_CLUSTER4},
+	{"DFX_CLUSTER5",	PMC_PSS_BIT_CHT_DFX_CLUSTER5},
+	{},
+};
+
+static const struct pmc_reg_map byt_reg_map = {
+	.d3_sts_0	= d3_sts_0_map,
+	.d3_sts_1	= byt_d3_sts_1_map,
+	.func_dis	= d3_sts_0_map,
+	.func_dis_2	= byt_d3_sts_1_map,
+	.pss		= byt_pss_map,
+};
+
+static const struct pmc_reg_map cht_reg_map = {
+	.d3_sts_0	= d3_sts_0_map,
+	.d3_sts_1	= cht_d3_sts_1_map,
+	.func_dis	= d3_sts_0_map,
+	.func_dis_2	= cht_func_dis_2_map,
+	.pss		= cht_pss_map,
 };
 
 static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
@@ -180,36 +235,39 @@ static void pmc_hw_reg_setup(struct pmc_dev *pmc)
 }
 
 #ifdef CONFIG_DEBUG_FS
+static void pmc_dev_state_print(struct seq_file *s, int reg_index,
+				u32 sts, const struct pmc_bit_map *sts_map,
+				u32 fd, const struct pmc_bit_map *fd_map)
+{
+	int offset = PMC_REG_BIT_WIDTH * reg_index;
+	int index;
+
+	for (index = 0; sts_map[index].name; index++) {
+		seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
+			offset + index, sts_map[index].name,
+			fd_map[index].bit_mask & fd ?  "Disabled" : "Enabled ",
+			sts_map[index].bit_mask & sts ?  "D3" : "D0");
+	}
+}
+
 static int pmc_dev_state_show(struct seq_file *s, void *unused)
 {
 	struct pmc_dev *pmc = s->private;
-	const struct pmc_bit_map *map = pmc->map->dev;
-	u32 func_dis, func_dis_2, func_dis_index;
-	u32 d3_sts_0, d3_sts_1, d3_sts_index;
-	int index, reg_index;
+	const struct pmc_reg_map *m = pmc->map;
+	u32 func_dis, func_dis_2;
+	u32 d3_sts_0, d3_sts_1;
 
 	func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
 	func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
 	d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
 	d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
 
-	for (index = 0; map[index].name; index++) {
-		reg_index = index / PMC_REG_BIT_WIDTH;
-		if (reg_index) {
-			func_dis_index = func_dis_2;
-			d3_sts_index = d3_sts_1;
-		} else {
-			func_dis_index = func_dis;
-			d3_sts_index = d3_sts_0;
-		}
+	/* Low part */
+	pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis);
+
+	/* High part */
+	pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2);
 
-		seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
-			index, map[index].name,
-			map[index].bit_mask & func_dis_index ?
-			"Disabled" : "Enabled ",
-			map[index].bit_mask & d3_sts_index ?
-			"D3" : "D0");
-	}
 	return 0;
 }
 
@@ -325,9 +383,10 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc)
 }
 #endif /* CONFIG_DEBUG_FS */
 
-static int pmc_setup_dev(struct pci_dev *pdev, const struct pmc_reg_map *map)
+static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	struct pmc_dev *pmc = &pmc_device;
+	const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data;
 	int ret;
 
 	/* Obtain ACPI base address */
@@ -369,7 +428,8 @@ static int pmc_setup_dev(struct pci_dev *pdev, const struct pmc_reg_map *map)
  * a driver on the same PCI id.
  */
 static const struct pci_device_id pmc_pci_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map },
 	{ 0, },
 };
 
@@ -391,7 +451,7 @@ static int __init pmc_atom_init(void)
 	for_each_pci_dev(pdev) {
 		ent = pci_match_id(pmc_pci_ids, pdev);
 		if (ent)
-			return pmc_setup_dev(pdev, &reg_map);
+			return pmc_setup_dev(pdev, ent);
 	}
 	/* Device not found. */
 	return -ENODEV;
-- 
cgit v1.2.3-70-g09d2


From 91780c41a9e03ca6c351a0b2152662139b94b274 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 6 Jul 2015 17:29:04 +0300
Subject: x86/platform/intel/pmc_atom: Move the PMC-Atom code to
 arch/x86/platform/atom

This is specific driver for Intel Atom SoCs like BayTrail and
Braswell. Let's move it to dedicated folder and alleviate a
arch/x86/kernel burden.

There is no functional change.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Kumar P Mahesh <mahesh.kumar.p@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1436192944-56496-6-git-send-email-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/Makefile          |   1 -
 arch/x86/kernel/pmc_atom.c        | 465 --------------------------------------
 arch/x86/platform/atom/Makefile   |   3 +-
 arch/x86/platform/atom/pmc_atom.c | 465 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 467 insertions(+), 467 deletions(-)
 delete mode 100644 arch/x86/kernel/pmc_atom.c
 create mode 100644 arch/x86/platform/atom/pmc_atom.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f15af41bd80..81db53ba9da8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -108,7 +108,6 @@ obj-$(CONFIG_EFI)			+= sysfb_efi.o
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
 obj-$(CONFIG_IOSF_MBI)			+= iosf_mbi.o
-obj-$(CONFIG_PMC_ATOM)			+= pmc_atom.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
deleted file mode 100644
index e814d341a703..000000000000
--- a/arch/x86/kernel/pmc_atom.c
+++ /dev/null
@@ -1,465 +0,0 @@
-/*
- * Intel Atom SOC Power Management Controller Driver
- * Copyright (c) 2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/device.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <linux/io.h>
-
-#include <asm/pmc_atom.h>
-
-struct pmc_bit_map {
-	const char *name;
-	u32 bit_mask;
-};
-
-struct pmc_reg_map {
-	const struct pmc_bit_map *d3_sts_0;
-	const struct pmc_bit_map *d3_sts_1;
-	const struct pmc_bit_map *func_dis;
-	const struct pmc_bit_map *func_dis_2;
-	const struct pmc_bit_map *pss;
-};
-
-struct pmc_dev {
-	u32 base_addr;
-	void __iomem *regmap;
-	const struct pmc_reg_map *map;
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *dbgfs_dir;
-#endif /* CONFIG_DEBUG_FS */
-	bool init;
-};
-
-static struct pmc_dev pmc_device;
-static u32 acpi_base_addr;
-
-static const struct pmc_bit_map d3_sts_0_map[] = {
-	{"LPSS1_F0_DMA",	BIT_LPSS1_F0_DMA},
-	{"LPSS1_F1_PWM1",	BIT_LPSS1_F1_PWM1},
-	{"LPSS1_F2_PWM2",	BIT_LPSS1_F2_PWM2},
-	{"LPSS1_F3_HSUART1",	BIT_LPSS1_F3_HSUART1},
-	{"LPSS1_F4_HSUART2",	BIT_LPSS1_F4_HSUART2},
-	{"LPSS1_F5_SPI",	BIT_LPSS1_F5_SPI},
-	{"LPSS1_F6_Reserved",	BIT_LPSS1_F6_XXX},
-	{"LPSS1_F7_Reserved",	BIT_LPSS1_F7_XXX},
-	{"SCC_EMMC",		BIT_SCC_EMMC},
-	{"SCC_SDIO",		BIT_SCC_SDIO},
-	{"SCC_SDCARD",		BIT_SCC_SDCARD},
-	{"SCC_MIPI",		BIT_SCC_MIPI},
-	{"HDA",			BIT_HDA},
-	{"LPE",			BIT_LPE},
-	{"OTG",			BIT_OTG},
-	{"USH",			BIT_USH},
-	{"GBE",			BIT_GBE},
-	{"SATA",		BIT_SATA},
-	{"USB_EHCI",		BIT_USB_EHCI},
-	{"SEC",			BIT_SEC},
-	{"PCIE_PORT0",		BIT_PCIE_PORT0},
-	{"PCIE_PORT1",		BIT_PCIE_PORT1},
-	{"PCIE_PORT2",		BIT_PCIE_PORT2},
-	{"PCIE_PORT3",		BIT_PCIE_PORT3},
-	{"LPSS2_F0_DMA",	BIT_LPSS2_F0_DMA},
-	{"LPSS2_F1_I2C1",	BIT_LPSS2_F1_I2C1},
-	{"LPSS2_F2_I2C2",	BIT_LPSS2_F2_I2C2},
-	{"LPSS2_F3_I2C3",	BIT_LPSS2_F3_I2C3},
-	{"LPSS2_F3_I2C4",	BIT_LPSS2_F4_I2C4},
-	{"LPSS2_F5_I2C5",	BIT_LPSS2_F5_I2C5},
-	{"LPSS2_F6_I2C6",	BIT_LPSS2_F6_I2C6},
-	{"LPSS2_F7_I2C7",	BIT_LPSS2_F7_I2C7},
-	{},
-};
-
-static struct pmc_bit_map byt_d3_sts_1_map[] = {
-	{"SMB",			BIT_SMB},
-	{"OTG_SS_PHY",		BIT_OTG_SS_PHY},
-	{"USH_SS_PHY",		BIT_USH_SS_PHY},
-	{"DFX",			BIT_DFX},
-	{},
-};
-
-static struct pmc_bit_map cht_d3_sts_1_map[] = {
-	{"SMB",			BIT_SMB},
-	{"GMM",			BIT_STS_GMM},
-	{"ISH",			BIT_STS_ISH},
-	{},
-};
-
-static struct pmc_bit_map cht_func_dis_2_map[] = {
-	{"SMB",			BIT_SMB},
-	{"GMM",			BIT_FD_GMM},
-	{"ISH",			BIT_FD_ISH},
-	{},
-};
-
-static const struct pmc_bit_map byt_pss_map[] = {
-	{"GBE",			PMC_PSS_BIT_GBE},
-	{"SATA",		PMC_PSS_BIT_SATA},
-	{"HDA",			PMC_PSS_BIT_HDA},
-	{"SEC",			PMC_PSS_BIT_SEC},
-	{"PCIE",		PMC_PSS_BIT_PCIE},
-	{"LPSS",		PMC_PSS_BIT_LPSS},
-	{"LPE",			PMC_PSS_BIT_LPE},
-	{"DFX",			PMC_PSS_BIT_DFX},
-	{"USH_CTRL",		PMC_PSS_BIT_USH_CTRL},
-	{"USH_SUS",		PMC_PSS_BIT_USH_SUS},
-	{"USH_VCCS",		PMC_PSS_BIT_USH_VCCS},
-	{"USH_VCCA",		PMC_PSS_BIT_USH_VCCA},
-	{"OTG_CTRL",		PMC_PSS_BIT_OTG_CTRL},
-	{"OTG_VCCS",		PMC_PSS_BIT_OTG_VCCS},
-	{"OTG_VCCA_CLK",	PMC_PSS_BIT_OTG_VCCA_CLK},
-	{"OTG_VCCA",		PMC_PSS_BIT_OTG_VCCA},
-	{"USB",			PMC_PSS_BIT_USB},
-	{"USB_SUS",		PMC_PSS_BIT_USB_SUS},
-	{},
-};
-
-static const struct pmc_bit_map cht_pss_map[] = {
-	{"SATA",		PMC_PSS_BIT_SATA},
-	{"HDA",			PMC_PSS_BIT_HDA},
-	{"SEC",			PMC_PSS_BIT_SEC},
-	{"PCIE",		PMC_PSS_BIT_PCIE},
-	{"LPSS",		PMC_PSS_BIT_LPSS},
-	{"LPE",			PMC_PSS_BIT_LPE},
-	{"UFS",			PMC_PSS_BIT_CHT_UFS},
-	{"UXD",			PMC_PSS_BIT_CHT_UXD},
-	{"UXD_FD",		PMC_PSS_BIT_CHT_UXD_FD},
-	{"UX_ENG",		PMC_PSS_BIT_CHT_UX_ENG},
-	{"USB_SUS",		PMC_PSS_BIT_CHT_USB_SUS},
-	{"GMM",			PMC_PSS_BIT_CHT_GMM},
-	{"ISH",			PMC_PSS_BIT_CHT_ISH},
-	{"DFX_MASTER",		PMC_PSS_BIT_CHT_DFX_MASTER},
-	{"DFX_CLUSTER1",	PMC_PSS_BIT_CHT_DFX_CLUSTER1},
-	{"DFX_CLUSTER2",	PMC_PSS_BIT_CHT_DFX_CLUSTER2},
-	{"DFX_CLUSTER3",	PMC_PSS_BIT_CHT_DFX_CLUSTER3},
-	{"DFX_CLUSTER4",	PMC_PSS_BIT_CHT_DFX_CLUSTER4},
-	{"DFX_CLUSTER5",	PMC_PSS_BIT_CHT_DFX_CLUSTER5},
-	{},
-};
-
-static const struct pmc_reg_map byt_reg_map = {
-	.d3_sts_0	= d3_sts_0_map,
-	.d3_sts_1	= byt_d3_sts_1_map,
-	.func_dis	= d3_sts_0_map,
-	.func_dis_2	= byt_d3_sts_1_map,
-	.pss		= byt_pss_map,
-};
-
-static const struct pmc_reg_map cht_reg_map = {
-	.d3_sts_0	= d3_sts_0_map,
-	.d3_sts_1	= cht_d3_sts_1_map,
-	.func_dis	= d3_sts_0_map,
-	.func_dis_2	= cht_func_dis_2_map,
-	.pss		= cht_pss_map,
-};
-
-static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
-{
-	return readl(pmc->regmap + reg_offset);
-}
-
-static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
-{
-	writel(val, pmc->regmap + reg_offset);
-}
-
-int pmc_atom_read(int offset, u32 *value)
-{
-	struct pmc_dev *pmc = &pmc_device;
-
-	if (!pmc->init)
-		return -ENODEV;
-
-	*value = pmc_reg_read(pmc, offset);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pmc_atom_read);
-
-int pmc_atom_write(int offset, u32 value)
-{
-	struct pmc_dev *pmc = &pmc_device;
-
-	if (!pmc->init)
-		return -ENODEV;
-
-	pmc_reg_write(pmc, offset, value);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pmc_atom_write);
-
-static void pmc_power_off(void)
-{
-	u16	pm1_cnt_port;
-	u32	pm1_cnt_value;
-
-	pr_info("Preparing to enter system sleep state S5\n");
-
-	pm1_cnt_port = acpi_base_addr + PM1_CNT;
-
-	pm1_cnt_value = inl(pm1_cnt_port);
-	pm1_cnt_value &= SLEEP_TYPE_MASK;
-	pm1_cnt_value |= SLEEP_TYPE_S5;
-	pm1_cnt_value |= SLEEP_ENABLE;
-
-	outl(pm1_cnt_value, pm1_cnt_port);
-}
-
-static void pmc_hw_reg_setup(struct pmc_dev *pmc)
-{
-	/*
-	 * Disable PMC S0IX_WAKE_EN events coming from:
-	 * - LPC clock run
-	 * - GPIO_SUS ored dedicated IRQs
-	 * - GPIO_SCORE ored dedicated IRQs
-	 * - GPIO_SUS shared IRQ
-	 * - GPIO_SCORE shared IRQ
-	 */
-	pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING);
-}
-
-#ifdef CONFIG_DEBUG_FS
-static void pmc_dev_state_print(struct seq_file *s, int reg_index,
-				u32 sts, const struct pmc_bit_map *sts_map,
-				u32 fd, const struct pmc_bit_map *fd_map)
-{
-	int offset = PMC_REG_BIT_WIDTH * reg_index;
-	int index;
-
-	for (index = 0; sts_map[index].name; index++) {
-		seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
-			offset + index, sts_map[index].name,
-			fd_map[index].bit_mask & fd ?  "Disabled" : "Enabled ",
-			sts_map[index].bit_mask & sts ?  "D3" : "D0");
-	}
-}
-
-static int pmc_dev_state_show(struct seq_file *s, void *unused)
-{
-	struct pmc_dev *pmc = s->private;
-	const struct pmc_reg_map *m = pmc->map;
-	u32 func_dis, func_dis_2;
-	u32 d3_sts_0, d3_sts_1;
-
-	func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
-	func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
-	d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
-	d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
-
-	/* Low part */
-	pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis);
-
-	/* High part */
-	pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2);
-
-	return 0;
-}
-
-static int pmc_dev_state_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmc_dev_state_show, inode->i_private);
-}
-
-static const struct file_operations pmc_dev_state_ops = {
-	.open		= pmc_dev_state_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int pmc_pss_state_show(struct seq_file *s, void *unused)
-{
-	struct pmc_dev *pmc = s->private;
-	const struct pmc_bit_map *map = pmc->map->pss;
-	u32 pss = pmc_reg_read(pmc, PMC_PSS);
-	int index;
-
-	for (index = 0; map[index].name; index++) {
-		seq_printf(s, "Island: %-2d - %-32s\tState: %s\n",
-			index, map[index].name,
-			map[index].bit_mask & pss ? "Off" : "On");
-	}
-	return 0;
-}
-
-static int pmc_pss_state_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmc_pss_state_show, inode->i_private);
-}
-
-static const struct file_operations pmc_pss_state_ops = {
-	.open		= pmc_pss_state_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
-{
-	struct pmc_dev *pmc = s->private;
-	u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr;
-
-	s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT;
-	s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT;
-	s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT;
-	s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT;
-	s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT;
-
-	seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr);
-	seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr);
-	seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr);
-	seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr);
-	seq_printf(s, "S0   Residency:\t%lldus\n", s0_tmr);
-	return 0;
-}
-
-static int pmc_sleep_tmr_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmc_sleep_tmr_show, inode->i_private);
-}
-
-static const struct file_operations pmc_sleep_tmr_ops = {
-	.open		= pmc_sleep_tmr_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
-{
-	debugfs_remove_recursive(pmc->dbgfs_dir);
-}
-
-static int pmc_dbgfs_register(struct pmc_dev *pmc)
-{
-	struct dentry *dir, *f;
-
-	dir = debugfs_create_dir("pmc_atom", NULL);
-	if (!dir)
-		return -ENOMEM;
-
-	pmc->dbgfs_dir = dir;
-
-	f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
-				dir, pmc, &pmc_dev_state_ops);
-	if (!f)
-		goto err;
-
-	f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
-				dir, pmc, &pmc_pss_state_ops);
-	if (!f)
-		goto err;
-
-	f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
-				dir, pmc, &pmc_sleep_tmr_ops);
-	if (!f)
-		goto err;
-
-	return 0;
-err:
-	pmc_dbgfs_unregister(pmc);
-	return -ENODEV;
-}
-#else
-static int pmc_dbgfs_register(struct pmc_dev *pmc)
-{
-	return 0;
-}
-#endif /* CONFIG_DEBUG_FS */
-
-static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-	struct pmc_dev *pmc = &pmc_device;
-	const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data;
-	int ret;
-
-	/* Obtain ACPI base address */
-	pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr);
-	acpi_base_addr &= ACPI_BASE_ADDR_MASK;
-
-	/* Install power off function */
-	if (acpi_base_addr != 0 && pm_power_off == NULL)
-		pm_power_off = pmc_power_off;
-
-	pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr);
-	pmc->base_addr &= PMC_BASE_ADDR_MASK;
-
-	pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN);
-	if (!pmc->regmap) {
-		dev_err(&pdev->dev, "error: ioremap failed\n");
-		return -ENOMEM;
-	}
-
-	pmc->map = map;
-
-	/* PMC hardware registers setup */
-	pmc_hw_reg_setup(pmc);
-
-	ret = pmc_dbgfs_register(pmc);
-	if (ret)
-		dev_warn(&pdev->dev, "debugfs register failed\n");
-
-	pmc->init = true;
-	return ret;
-}
-
-/*
- * Data for PCI driver interface
- *
- * This data only exists for exporting the supported
- * PCI ids via MODULE_DEVICE_TABLE.  We do not actually
- * register a pci_driver, because lpc_ich will register
- * a driver on the same PCI id.
- */
-static const struct pci_device_id pmc_pci_ids[] = {
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map },
-	{ 0, },
-};
-
-MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
-
-static int __init pmc_atom_init(void)
-{
-	struct pci_dev *pdev = NULL;
-	const struct pci_device_id *ent;
-
-	/* We look for our device - PCU PMC
-	 * we assume that there is max. one device.
-	 *
-	 * We can't use plain pci_driver mechanism,
-	 * as the device is really a multiple function device,
-	 * main driver that binds to the pci_device is lpc_ich
-	 * and have to find & bind to the device this way.
-	 */
-	for_each_pci_dev(pdev) {
-		ent = pci_match_id(pmc_pci_ids, pdev);
-		if (ent)
-			return pmc_setup_dev(pdev, ent);
-	}
-	/* Device not found. */
-	return -ENODEV;
-}
-
-module_init(pmc_atom_init);
-/* no module_exit, this driver shouldn't be unloaded */
-
-MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
-MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile
index 0a3a40cbc794..40983f5b0858 100644
--- a/arch/x86/platform/atom/Makefile
+++ b/arch/x86/platform/atom/Makefile
@@ -1 +1,2 @@
-obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o
+obj-$(CONFIG_PMC_ATOM)		+= pmc_atom.o
+obj-$(CONFIG_PUNIT_ATOM_DEBUG)	+= punit_atom_debug.o
diff --git a/arch/x86/platform/atom/pmc_atom.c b/arch/x86/platform/atom/pmc_atom.c
new file mode 100644
index 000000000000..e814d341a703
--- /dev/null
+++ b/arch/x86/platform/atom/pmc_atom.c
@@ -0,0 +1,465 @@
+/*
+ * Intel Atom SOC Power Management Controller Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+
+#include <asm/pmc_atom.h>
+
+struct pmc_bit_map {
+	const char *name;
+	u32 bit_mask;
+};
+
+struct pmc_reg_map {
+	const struct pmc_bit_map *d3_sts_0;
+	const struct pmc_bit_map *d3_sts_1;
+	const struct pmc_bit_map *func_dis;
+	const struct pmc_bit_map *func_dis_2;
+	const struct pmc_bit_map *pss;
+};
+
+struct pmc_dev {
+	u32 base_addr;
+	void __iomem *regmap;
+	const struct pmc_reg_map *map;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dbgfs_dir;
+#endif /* CONFIG_DEBUG_FS */
+	bool init;
+};
+
+static struct pmc_dev pmc_device;
+static u32 acpi_base_addr;
+
+static const struct pmc_bit_map d3_sts_0_map[] = {
+	{"LPSS1_F0_DMA",	BIT_LPSS1_F0_DMA},
+	{"LPSS1_F1_PWM1",	BIT_LPSS1_F1_PWM1},
+	{"LPSS1_F2_PWM2",	BIT_LPSS1_F2_PWM2},
+	{"LPSS1_F3_HSUART1",	BIT_LPSS1_F3_HSUART1},
+	{"LPSS1_F4_HSUART2",	BIT_LPSS1_F4_HSUART2},
+	{"LPSS1_F5_SPI",	BIT_LPSS1_F5_SPI},
+	{"LPSS1_F6_Reserved",	BIT_LPSS1_F6_XXX},
+	{"LPSS1_F7_Reserved",	BIT_LPSS1_F7_XXX},
+	{"SCC_EMMC",		BIT_SCC_EMMC},
+	{"SCC_SDIO",		BIT_SCC_SDIO},
+	{"SCC_SDCARD",		BIT_SCC_SDCARD},
+	{"SCC_MIPI",		BIT_SCC_MIPI},
+	{"HDA",			BIT_HDA},
+	{"LPE",			BIT_LPE},
+	{"OTG",			BIT_OTG},
+	{"USH",			BIT_USH},
+	{"GBE",			BIT_GBE},
+	{"SATA",		BIT_SATA},
+	{"USB_EHCI",		BIT_USB_EHCI},
+	{"SEC",			BIT_SEC},
+	{"PCIE_PORT0",		BIT_PCIE_PORT0},
+	{"PCIE_PORT1",		BIT_PCIE_PORT1},
+	{"PCIE_PORT2",		BIT_PCIE_PORT2},
+	{"PCIE_PORT3",		BIT_PCIE_PORT3},
+	{"LPSS2_F0_DMA",	BIT_LPSS2_F0_DMA},
+	{"LPSS2_F1_I2C1",	BIT_LPSS2_F1_I2C1},
+	{"LPSS2_F2_I2C2",	BIT_LPSS2_F2_I2C2},
+	{"LPSS2_F3_I2C3",	BIT_LPSS2_F3_I2C3},
+	{"LPSS2_F3_I2C4",	BIT_LPSS2_F4_I2C4},
+	{"LPSS2_F5_I2C5",	BIT_LPSS2_F5_I2C5},
+	{"LPSS2_F6_I2C6",	BIT_LPSS2_F6_I2C6},
+	{"LPSS2_F7_I2C7",	BIT_LPSS2_F7_I2C7},
+	{},
+};
+
+static struct pmc_bit_map byt_d3_sts_1_map[] = {
+	{"SMB",			BIT_SMB},
+	{"OTG_SS_PHY",		BIT_OTG_SS_PHY},
+	{"USH_SS_PHY",		BIT_USH_SS_PHY},
+	{"DFX",			BIT_DFX},
+	{},
+};
+
+static struct pmc_bit_map cht_d3_sts_1_map[] = {
+	{"SMB",			BIT_SMB},
+	{"GMM",			BIT_STS_GMM},
+	{"ISH",			BIT_STS_ISH},
+	{},
+};
+
+static struct pmc_bit_map cht_func_dis_2_map[] = {
+	{"SMB",			BIT_SMB},
+	{"GMM",			BIT_FD_GMM},
+	{"ISH",			BIT_FD_ISH},
+	{},
+};
+
+static const struct pmc_bit_map byt_pss_map[] = {
+	{"GBE",			PMC_PSS_BIT_GBE},
+	{"SATA",		PMC_PSS_BIT_SATA},
+	{"HDA",			PMC_PSS_BIT_HDA},
+	{"SEC",			PMC_PSS_BIT_SEC},
+	{"PCIE",		PMC_PSS_BIT_PCIE},
+	{"LPSS",		PMC_PSS_BIT_LPSS},
+	{"LPE",			PMC_PSS_BIT_LPE},
+	{"DFX",			PMC_PSS_BIT_DFX},
+	{"USH_CTRL",		PMC_PSS_BIT_USH_CTRL},
+	{"USH_SUS",		PMC_PSS_BIT_USH_SUS},
+	{"USH_VCCS",		PMC_PSS_BIT_USH_VCCS},
+	{"USH_VCCA",		PMC_PSS_BIT_USH_VCCA},
+	{"OTG_CTRL",		PMC_PSS_BIT_OTG_CTRL},
+	{"OTG_VCCS",		PMC_PSS_BIT_OTG_VCCS},
+	{"OTG_VCCA_CLK",	PMC_PSS_BIT_OTG_VCCA_CLK},
+	{"OTG_VCCA",		PMC_PSS_BIT_OTG_VCCA},
+	{"USB",			PMC_PSS_BIT_USB},
+	{"USB_SUS",		PMC_PSS_BIT_USB_SUS},
+	{},
+};
+
+static const struct pmc_bit_map cht_pss_map[] = {
+	{"SATA",		PMC_PSS_BIT_SATA},
+	{"HDA",			PMC_PSS_BIT_HDA},
+	{"SEC",			PMC_PSS_BIT_SEC},
+	{"PCIE",		PMC_PSS_BIT_PCIE},
+	{"LPSS",		PMC_PSS_BIT_LPSS},
+	{"LPE",			PMC_PSS_BIT_LPE},
+	{"UFS",			PMC_PSS_BIT_CHT_UFS},
+	{"UXD",			PMC_PSS_BIT_CHT_UXD},
+	{"UXD_FD",		PMC_PSS_BIT_CHT_UXD_FD},
+	{"UX_ENG",		PMC_PSS_BIT_CHT_UX_ENG},
+	{"USB_SUS",		PMC_PSS_BIT_CHT_USB_SUS},
+	{"GMM",			PMC_PSS_BIT_CHT_GMM},
+	{"ISH",			PMC_PSS_BIT_CHT_ISH},
+	{"DFX_MASTER",		PMC_PSS_BIT_CHT_DFX_MASTER},
+	{"DFX_CLUSTER1",	PMC_PSS_BIT_CHT_DFX_CLUSTER1},
+	{"DFX_CLUSTER2",	PMC_PSS_BIT_CHT_DFX_CLUSTER2},
+	{"DFX_CLUSTER3",	PMC_PSS_BIT_CHT_DFX_CLUSTER3},
+	{"DFX_CLUSTER4",	PMC_PSS_BIT_CHT_DFX_CLUSTER4},
+	{"DFX_CLUSTER5",	PMC_PSS_BIT_CHT_DFX_CLUSTER5},
+	{},
+};
+
+static const struct pmc_reg_map byt_reg_map = {
+	.d3_sts_0	= d3_sts_0_map,
+	.d3_sts_1	= byt_d3_sts_1_map,
+	.func_dis	= d3_sts_0_map,
+	.func_dis_2	= byt_d3_sts_1_map,
+	.pss		= byt_pss_map,
+};
+
+static const struct pmc_reg_map cht_reg_map = {
+	.d3_sts_0	= d3_sts_0_map,
+	.d3_sts_1	= cht_d3_sts_1_map,
+	.func_dis	= d3_sts_0_map,
+	.func_dis_2	= cht_func_dis_2_map,
+	.pss		= cht_pss_map,
+};
+
+static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
+{
+	return readl(pmc->regmap + reg_offset);
+}
+
+static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
+{
+	writel(val, pmc->regmap + reg_offset);
+}
+
+int pmc_atom_read(int offset, u32 *value)
+{
+	struct pmc_dev *pmc = &pmc_device;
+
+	if (!pmc->init)
+		return -ENODEV;
+
+	*value = pmc_reg_read(pmc, offset);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_read);
+
+int pmc_atom_write(int offset, u32 value)
+{
+	struct pmc_dev *pmc = &pmc_device;
+
+	if (!pmc->init)
+		return -ENODEV;
+
+	pmc_reg_write(pmc, offset, value);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_write);
+
+static void pmc_power_off(void)
+{
+	u16	pm1_cnt_port;
+	u32	pm1_cnt_value;
+
+	pr_info("Preparing to enter system sleep state S5\n");
+
+	pm1_cnt_port = acpi_base_addr + PM1_CNT;
+
+	pm1_cnt_value = inl(pm1_cnt_port);
+	pm1_cnt_value &= SLEEP_TYPE_MASK;
+	pm1_cnt_value |= SLEEP_TYPE_S5;
+	pm1_cnt_value |= SLEEP_ENABLE;
+
+	outl(pm1_cnt_value, pm1_cnt_port);
+}
+
+static void pmc_hw_reg_setup(struct pmc_dev *pmc)
+{
+	/*
+	 * Disable PMC S0IX_WAKE_EN events coming from:
+	 * - LPC clock run
+	 * - GPIO_SUS ored dedicated IRQs
+	 * - GPIO_SCORE ored dedicated IRQs
+	 * - GPIO_SUS shared IRQ
+	 * - GPIO_SCORE shared IRQ
+	 */
+	pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void pmc_dev_state_print(struct seq_file *s, int reg_index,
+				u32 sts, const struct pmc_bit_map *sts_map,
+				u32 fd, const struct pmc_bit_map *fd_map)
+{
+	int offset = PMC_REG_BIT_WIDTH * reg_index;
+	int index;
+
+	for (index = 0; sts_map[index].name; index++) {
+		seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
+			offset + index, sts_map[index].name,
+			fd_map[index].bit_mask & fd ?  "Disabled" : "Enabled ",
+			sts_map[index].bit_mask & sts ?  "D3" : "D0");
+	}
+}
+
+static int pmc_dev_state_show(struct seq_file *s, void *unused)
+{
+	struct pmc_dev *pmc = s->private;
+	const struct pmc_reg_map *m = pmc->map;
+	u32 func_dis, func_dis_2;
+	u32 d3_sts_0, d3_sts_1;
+
+	func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
+	func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
+	d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
+	d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
+
+	/* Low part */
+	pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis);
+
+	/* High part */
+	pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2);
+
+	return 0;
+}
+
+static int pmc_dev_state_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, pmc_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_dev_state_ops = {
+	.open		= pmc_dev_state_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int pmc_pss_state_show(struct seq_file *s, void *unused)
+{
+	struct pmc_dev *pmc = s->private;
+	const struct pmc_bit_map *map = pmc->map->pss;
+	u32 pss = pmc_reg_read(pmc, PMC_PSS);
+	int index;
+
+	for (index = 0; map[index].name; index++) {
+		seq_printf(s, "Island: %-2d - %-32s\tState: %s\n",
+			index, map[index].name,
+			map[index].bit_mask & pss ? "Off" : "On");
+	}
+	return 0;
+}
+
+static int pmc_pss_state_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, pmc_pss_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_pss_state_ops = {
+	.open		= pmc_pss_state_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
+{
+	struct pmc_dev *pmc = s->private;
+	u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr;
+
+	s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT;
+	s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT;
+	s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT;
+	s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT;
+	s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT;
+
+	seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr);
+	seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr);
+	seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr);
+	seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr);
+	seq_printf(s, "S0   Residency:\t%lldus\n", s0_tmr);
+	return 0;
+}
+
+static int pmc_sleep_tmr_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, pmc_sleep_tmr_show, inode->i_private);
+}
+
+static const struct file_operations pmc_sleep_tmr_ops = {
+	.open		= pmc_sleep_tmr_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
+{
+	debugfs_remove_recursive(pmc->dbgfs_dir);
+}
+
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
+{
+	struct dentry *dir, *f;
+
+	dir = debugfs_create_dir("pmc_atom", NULL);
+	if (!dir)
+		return -ENOMEM;
+
+	pmc->dbgfs_dir = dir;
+
+	f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
+				dir, pmc, &pmc_dev_state_ops);
+	if (!f)
+		goto err;
+
+	f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
+				dir, pmc, &pmc_pss_state_ops);
+	if (!f)
+		goto err;
+
+	f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
+				dir, pmc, &pmc_sleep_tmr_ops);
+	if (!f)
+		goto err;
+
+	return 0;
+err:
+	pmc_dbgfs_unregister(pmc);
+	return -ENODEV;
+}
+#else
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
+{
+	return 0;
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct pmc_dev *pmc = &pmc_device;
+	const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data;
+	int ret;
+
+	/* Obtain ACPI base address */
+	pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr);
+	acpi_base_addr &= ACPI_BASE_ADDR_MASK;
+
+	/* Install power off function */
+	if (acpi_base_addr != 0 && pm_power_off == NULL)
+		pm_power_off = pmc_power_off;
+
+	pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr);
+	pmc->base_addr &= PMC_BASE_ADDR_MASK;
+
+	pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN);
+	if (!pmc->regmap) {
+		dev_err(&pdev->dev, "error: ioremap failed\n");
+		return -ENOMEM;
+	}
+
+	pmc->map = map;
+
+	/* PMC hardware registers setup */
+	pmc_hw_reg_setup(pmc);
+
+	ret = pmc_dbgfs_register(pmc);
+	if (ret)
+		dev_warn(&pdev->dev, "debugfs register failed\n");
+
+	pmc->init = true;
+	return ret;
+}
+
+/*
+ * Data for PCI driver interface
+ *
+ * This data only exists for exporting the supported
+ * PCI ids via MODULE_DEVICE_TABLE.  We do not actually
+ * register a pci_driver, because lpc_ich will register
+ * a driver on the same PCI id.
+ */
+static const struct pci_device_id pmc_pci_ids[] = {
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map },
+	{ 0, },
+};
+
+MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
+
+static int __init pmc_atom_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	const struct pci_device_id *ent;
+
+	/* We look for our device - PCU PMC
+	 * we assume that there is max. one device.
+	 *
+	 * We can't use plain pci_driver mechanism,
+	 * as the device is really a multiple function device,
+	 * main driver that binds to the pci_device is lpc_ich
+	 * and have to find & bind to the device this way.
+	 */
+	for_each_pci_dev(pdev) {
+		ent = pci_match_id(pmc_pci_ids, pdev);
+		if (ent)
+			return pmc_setup_dev(pdev, ent);
+	}
+	/* Device not found. */
+	return -ENODEV;
+}
+
+module_init(pmc_atom_init);
+/* no module_exit, this driver shouldn't be unloaded */
+
+MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
+MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-70-g09d2


From 1f484aa6904697f390027c12fba130fa94b20831 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:23 -0700
Subject: x86/entry: Move C entry and exit code to arch/x86/entry/common.c

The entry and exit C helpers were confusingly scattered between
ptrace.c and signal.c, even though they aren't specific to
ptrace or signal handling.  Move them together in a new file.

This change just moves code around.  It doesn't change anything.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/324d686821266544d8572423cc281f961da445f4.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/Makefile       |   1 +
 arch/x86/entry/common.c       | 253 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/signal.h |   1 +
 arch/x86/kernel/ptrace.c      | 202 +--------------------------------
 arch/x86/kernel/signal.c      |  28 +----
 5 files changed, 257 insertions(+), 228 deletions(-)
 create mode 100644 arch/x86/entry/common.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 7a144971db79..bd55dedd7614 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,6 +2,7 @@
 # Makefile for the x86 low level entry code
 #
 obj-y				:= entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
+obj-y				+= common.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
new file mode 100644
index 000000000000..917d0c3cb851
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,253 @@
+/*
+ * common.c - C code for kernel entry and exit
+ * Copyright (c) 2015 Andrew Lutomirski
+ * GPL v2
+ *
+ * Based on asm and ptrace code by many authors.  The code here originated
+ * in ptrace.c and signal.c.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+
+#include <asm/desc.h>
+#include <asm/traps.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {
+		audit_syscall_entry(regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
+#endif
+	{
+		audit_syscall_entry(regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
+
+/*
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2.  If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0:			resume the syscall
+ * 1:			go to phase 2; no seccomp phase 2 needed
+ * anything else:	go to phase 2; pass return value to seccomp
+ */
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	/*
+	 * If TIF_NOHZ is set, we are required to call user_exit() before
+	 * doing anything that could touch RCU.
+	 */
+	if (work & _TIF_NOHZ) {
+		user_exit();
+		work &= ~_TIF_NOHZ;
+	}
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;  /* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1.  Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;  /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+				unsigned long phase1_result)
+{
+	long ret = 0;
+	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	/*
+	 * If we stepped into a sysenter/syscall insn, it trapped in
+	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+	 * If user-mode had set TF itself, then it's still clear from
+	 * do_debug() and we need to set it again to restore the user
+	 * state.  If we entered on the slow path, TF was already set.
+	 */
+	if (work & _TIF_SINGLESTEP)
+		regs->flags |= X86_EFLAGS_TF;
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Call seccomp_phase2 before running the other hooks so that
+	 * they can see any changes made by a seccomp tracer.
+	 */
+	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
+		/* seccomp failures shouldn't expose any additional code. */
+		return -1;
+	}
+#endif
+
+	if (unlikely(work & _TIF_SYSCALL_EMU))
+		ret = -1L;
+
+	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+	    tracehook_report_syscall_entry(regs))
+		ret = -1L;
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_enter(regs, regs->orig_ax);
+
+	do_audit_syscall_entry(regs, arch);
+
+	return ret ?: regs->orig_ax;
+}
+
+long syscall_trace_enter(struct pt_regs *regs)
+{
+	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+	if (phase1_result == 0)
+		return regs->orig_ax;
+	else
+		return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
+void syscall_trace_leave(struct pt_regs *regs)
+{
+	bool step;
+
+	/*
+	 * We may come here right after calling schedule_user()
+	 * or do_notify_resume(), in which case we can be in RCU
+	 * user mode.
+	 */
+	user_exit();
+
+	audit_syscall_exit(regs);
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_exit(regs, regs->ax);
+
+	/*
+	 * If TIF_SYSCALL_EMU is set, we only get here because of
+	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+	 * We already reported this syscall instruction in
+	 * syscall_trace_enter().
+	 */
+	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
+			!test_thread_flag(TIF_SYSCALL_EMU);
+	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(regs, step);
+
+	user_enter();
+}
+
+/*
+ * notification of userspace execution resumption
+ * - triggered by the TIF_WORK_MASK flags
+ */
+__visible void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
+{
+	user_exit();
+
+	if (thread_info_flags & _TIF_UPROBE)
+		uprobe_notify_resume(regs);
+
+	/* deal with pending signal delivery */
+	if (thread_info_flags & _TIF_SIGPENDING)
+		do_signal(regs);
+
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(regs);
+	}
+	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+		fire_user_return_notifiers();
+
+	user_enter();
+}
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 31eab867e6d3..b42408bcf6b5 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -30,6 +30,7 @@ typedef sigset_t compat_sigset_t;
 #endif /* __ASSEMBLY__ */
 #include <uapi/asm/signal.h>
 #ifndef __ASSEMBLY__
+extern void do_signal(struct pt_regs *regs);
 extern void do_notify_resume(struct pt_regs *, void *, __u32);
 
 #define __ARCH_HAS_SA_RESTORER
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7155957b3c25..558f50edebca 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -37,12 +37,10 @@
 #include <asm/proto.h>
 #include <asm/hw_breakpoint.h>
 #include <asm/traps.h>
+#include <asm/syscall.h>
 
 #include "tls.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
 enum x86_regset {
 	REGSET_GENERAL,
 	REGSET_FP,
@@ -1444,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 	/* Send us the fake SIGTRAP */
 	force_sig_info(SIGTRAP, &info, tsk);
 }
-
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
-{
-#ifdef CONFIG_X86_64
-	if (arch == AUDIT_ARCH_X86_64) {
-		audit_syscall_entry(regs->orig_ax, regs->di,
-				    regs->si, regs->dx, regs->r10);
-	} else
-#endif
-	{
-		audit_syscall_entry(regs->orig_ax, regs->bx,
-				    regs->cx, regs->dx, regs->si);
-	}
-}
-
-/*
- * We can return 0 to resume the syscall or anything else to go to phase
- * 2.  If we resume the syscall, we need to put something appropriate in
- * regs->orig_ax.
- *
- * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
- * are fully functional.
- *
- * For phase 2's benefit, our return value is:
- * 0:			resume the syscall
- * 1:			go to phase 2; no seccomp phase 2 needed
- * anything else:	go to phase 2; pass return value to seccomp
- */
-unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
-{
-	unsigned long ret = 0;
-	u32 work;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	/*
-	 * If TIF_NOHZ is set, we are required to call user_exit() before
-	 * doing anything that could touch RCU.
-	 */
-	if (work & _TIF_NOHZ) {
-		user_exit();
-		work &= ~_TIF_NOHZ;
-	}
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Do seccomp first -- it should minimize exposure of other
-	 * code, and keeping seccomp fast is probably more valuable
-	 * than the rest of this.
-	 */
-	if (work & _TIF_SECCOMP) {
-		struct seccomp_data sd;
-
-		sd.arch = arch;
-		sd.nr = regs->orig_ax;
-		sd.instruction_pointer = regs->ip;
-#ifdef CONFIG_X86_64
-		if (arch == AUDIT_ARCH_X86_64) {
-			sd.args[0] = regs->di;
-			sd.args[1] = regs->si;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->r10;
-			sd.args[4] = regs->r8;
-			sd.args[5] = regs->r9;
-		} else
-#endif
-		{
-			sd.args[0] = regs->bx;
-			sd.args[1] = regs->cx;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->si;
-			sd.args[4] = regs->di;
-			sd.args[5] = regs->bp;
-		}
-
-		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
-		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
-
-		ret = seccomp_phase1(&sd);
-		if (ret == SECCOMP_PHASE1_SKIP) {
-			regs->orig_ax = -1;
-			ret = 0;
-		} else if (ret != SECCOMP_PHASE1_OK) {
-			return ret;  /* Go directly to phase 2 */
-		}
-
-		work &= ~_TIF_SECCOMP;
-	}
-#endif
-
-	/* Do our best to finish without phase 2. */
-	if (work == 0)
-		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
-
-#ifdef CONFIG_AUDITSYSCALL
-	if (work == _TIF_SYSCALL_AUDIT) {
-		/*
-		 * If there is no more work to be done except auditing,
-		 * then audit in phase 1.  Phase 2 always audits, so, if
-		 * we audit here, then we can't go on to phase 2.
-		 */
-		do_audit_syscall_entry(regs, arch);
-		return 0;
-	}
-#endif
-
-	return 1;  /* Something is enabled that we can't handle in phase 1 */
-}
-
-/* Returns the syscall nr to run (which should match regs->orig_ax). */
-long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
-				unsigned long phase1_result)
-{
-	long ret = 0;
-	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	/*
-	 * If we stepped into a sysenter/syscall insn, it trapped in
-	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
-	 * If user-mode had set TF itself, then it's still clear from
-	 * do_debug() and we need to set it again to restore the user
-	 * state.  If we entered on the slow path, TF was already set.
-	 */
-	if (work & _TIF_SINGLESTEP)
-		regs->flags |= X86_EFLAGS_TF;
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Call seccomp_phase2 before running the other hooks so that
-	 * they can see any changes made by a seccomp tracer.
-	 */
-	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
-		/* seccomp failures shouldn't expose any additional code. */
-		return -1;
-	}
-#endif
-
-	if (unlikely(work & _TIF_SYSCALL_EMU))
-		ret = -1L;
-
-	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
-	    tracehook_report_syscall_entry(regs))
-		ret = -1L;
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_enter(regs, regs->orig_ax);
-
-	do_audit_syscall_entry(regs, arch);
-
-	return ret ?: regs->orig_ax;
-}
-
-long syscall_trace_enter(struct pt_regs *regs)
-{
-	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
-	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
-
-	if (phase1_result == 0)
-		return regs->orig_ax;
-	else
-		return syscall_trace_enter_phase2(regs, arch, phase1_result);
-}
-
-void syscall_trace_leave(struct pt_regs *regs)
-{
-	bool step;
-
-	/*
-	 * We may come here right after calling schedule_user()
-	 * or do_notify_resume(), in which case we can be in RCU
-	 * user mode.
-	 */
-	user_exit();
-
-	audit_syscall_exit(regs);
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_exit(regs, regs->ax);
-
-	/*
-	 * If TIF_SYSCALL_EMU is set, we only get here because of
-	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
-	 * We already reported this syscall instruction in
-	 * syscall_trace_enter().
-	 */
-	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
-			!test_thread_flag(TIF_SYSCALL_EMU);
-	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
-
-	user_enter();
-}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6c22aad8b909..7e88cc782712 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -700,7 +700,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
  * mistake.
  */
-static void do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 
@@ -735,32 +735,6 @@ static void do_signal(struct pt_regs *regs)
 	restore_saved_sigmask();
 }
 
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-	user_exit();
-
-	if (thread_info_flags & _TIF_UPROBE)
-		uprobe_notify_resume(regs);
-
-	/* deal with pending signal delivery */
-	if (thread_info_flags & _TIF_SIGPENDING)
-		do_signal(regs);
-
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
-		clear_thread_flag(TIF_NOTIFY_RESUME);
-		tracehook_notify_resume(regs);
-	}
-	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
-		fire_user_return_notifiers();
-
-	user_enter();
-}
-
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
 	struct task_struct *me = current;
-- 
cgit v1.2.3-70-g09d2


From 02fdcd5eac9d653d1addbd69b0c58d73650e1c00 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:24 -0700
Subject: x86/traps, context_tracking: Assert that we're in CONTEXT_KERNEL in
 exception entries

Other than the super-atomic exception entries, all exception
entries are supposed to switch our context tracking state to
CONTEXT_KERNEL. Assert that they do.  These assertions appear
trivial at this point, as exception_enter() is the function
responsible for switching context, but I'm planning on reworking
x86's exception context tracking, and these assertions will help
make sure that all of this code keeps working.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/20fa1ee2d943233a184aaf96ff75394d3b34dfba.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f5791927aa64..2a783c4fe0e9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,6 +292,8 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 	enum ctx_state prev_state = exception_enter();
 	siginfo_t info;
 
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
 		conditional_sti(regs);
@@ -376,6 +378,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	siginfo_t *info;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
 		goto exit;
@@ -457,6 +460,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	conditional_sti(regs);
 
 	if (v8086_mode(regs)) {
@@ -514,6 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 		return;
 
 	prev_state = ist_enter(regs);
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 				SIGTRAP) == NOTIFY_STOP)
@@ -750,6 +755,7 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_MF);
 	exception_exit(prev_state);
 }
@@ -760,6 +766,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_XF);
 	exception_exit(prev_state);
 }
@@ -776,6 +783,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	BUG_ON(use_eager_fpu());
 
 #ifdef CONFIG_MATH_EMULATION
@@ -805,6 +813,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	local_irq_enable();
 
 	info.si_signo = SIGILL;
-- 
cgit v1.2.3-70-g09d2


From 8c84014f3bbb112d07e73f30a10ac8a3a72f8649 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:32 -0700
Subject: x86/entry: Remove exception_enter() from most trap handlers

On 64-bit kernels, we don't need it any more: we handle context
tracking directly on entry from user mode and exit to user mode.

On 32-bit kernels, we don't support context tracking at all, so
these callbacks had no effect.

Note: this doesn't change do_page_fault().  Before we do that,
we need to make sure that there is no code that can page fault
from kernel mode with CONTEXT_USER.  The 32-bit fast system call
stack argument code is the only offender I'm aware of right now.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/ae22f4dfebd799c916574089964592be218151f9.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/traps.h         |  4 +-
 arch/x86/kernel/cpu/mcheck/mce.c     |  5 +--
 arch/x86/kernel/cpu/mcheck/p5.c      |  5 +--
 arch/x86/kernel/cpu/mcheck/winchip.c |  4 +-
 arch/x86/kernel/traps.c              | 78 +++++++++---------------------------
 5 files changed, 27 insertions(+), 69 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index c5380bea2a36..c3496619740a 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void);
 asmlinkage void smp_deferred_error_interrupt(void);
 #endif
 
-extern enum ctx_state ist_enter(struct pt_regs *regs);
-extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+extern void ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs);
 extern void ist_begin_non_atomic(struct pt_regs *regs);
 extern void ist_end_non_atomic(void);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 96cceccd11b4..99940d151e7d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1029,7 +1029,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 {
 	struct mca_config *cfg = &mca_cfg;
 	struct mce m, *final;
-	enum ctx_state prev_state;
 	int i;
 	int worst = 0;
 	int severity;
@@ -1055,7 +1054,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	this_cpu_inc(mce_exception_count);
 
@@ -1227,7 +1226,7 @@ out:
 	local_irq_disable();
 	ist_end_non_atomic();
 done:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 737b0ad4e61a..12402e10aeff 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;
 /* Machine check handler for Pentium class Intel CPUs: */
 static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
 	u32 loaddr, hi, lotype;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
 
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 
 /* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 44f138296fbe..01dd8702880b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,12 +15,12 @@
 /* Machine check handler for WinChip C6: */
 static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 2a783c4fe0e9..8e65d8a9b8db 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,13 +108,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	preempt_count_dec();
 }
 
-enum ctx_state ist_enter(struct pt_regs *regs)
+void ist_enter(struct pt_regs *regs)
 {
-	enum ctx_state prev_state;
-
 	if (user_mode(regs)) {
-		/* Other than that, we're just an exception. */
-		prev_state = exception_enter();
+		CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	} else {
 		/*
 		 * We might have interrupted pretty much anything.  In
@@ -123,32 +120,25 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		prev_state = CONTEXT_KERNEL;  /* the value is irrelevant. */
 	}
 
 	/*
-	 * We are atomic because we're on the IST stack (or we're on x86_32,
-	 * in which case we still shouldn't schedule).
-	 *
-	 * This must be after exception_enter(), because exception_enter()
-	 * won't do anything if in_interrupt() returns true.
+	 * We are atomic because we're on the IST stack; or we're on
+	 * x86_32, in which case we still shouldn't schedule; or we're
+	 * on x86_64 and entered from user mode, in which case we're
+	 * still atomic unless ist_begin_non_atomic is called.
 	 */
 	preempt_count_add(HARDIRQ_OFFSET);
 
 	/* This code is a bit fragile.  Test it. */
 	rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
-
-	return prev_state;
 }
 
-void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
+void ist_exit(struct pt_regs *regs)
 {
-	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);
 
-	if (user_mode(regs))
-		return exception_exit(prev_state);
-	else
+	if (!user_mode(regs))
 		rcu_nmi_exit();
 }
 
@@ -162,7 +152,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
  * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
  * begins a non-atomic section within an ist_enter()/ist_exit() region.
  * Callers are responsible for enabling interrupts themselves inside
- * the non-atomic section, and callers must call is_end_non_atomic()
+ * the non-atomic section, and callers must call ist_end_non_atomic()
  * before ist_exit().
  */
 void ist_begin_non_atomic(struct pt_regs *regs)
@@ -289,7 +279,6 @@ NOKPROBE_SYMBOL(do_trap);
 static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 			  unsigned long trapnr, int signr)
 {
-	enum ctx_state prev_state = exception_enter();
 	siginfo_t info;
 
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
@@ -300,8 +289,6 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 		do_trap(trapnr, signr, str, regs, error_code,
 			fill_trap_info(regs, signr, trapnr, &info));
 	}
-
-	exception_exit(prev_state);
 }
 
 #define DO_ERROR(trapnr, signr, str, name)				\
@@ -353,7 +340,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	}
 #endif
 
-	ist_enter(regs);  /* Discard prev_state because we won't return. */
+	ist_enter(regs);
 	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
 	tsk->thread.error_code = error_code;
@@ -373,15 +360,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
 	const struct bndcsr *bndcsr;
 	siginfo_t *info;
 
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
-		goto exit;
+		return;
 	conditional_sti(regs);
 
 	if (!user_mode(regs))
@@ -438,9 +423,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 		die("bounds", regs, error_code);
 	}
 
-exit:
-	exception_exit(prev_state);
 	return;
+
 exit_trap:
 	/*
 	 * This path out is for all the cases where we could not
@@ -450,36 +434,33 @@ exit_trap:
 	 * time..
 	 */
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
-	exception_exit(prev_state);
 }
 
 dotraplinkage void
 do_general_protection(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk;
-	enum ctx_state prev_state;
 
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	conditional_sti(regs);
 
 	if (v8086_mode(regs)) {
 		local_irq_enable();
 		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
-		goto exit;
+		return;
 	}
 
 	tsk = current;
 	if (!user_mode(regs)) {
 		if (fixup_exception(regs))
-			goto exit;
+			return;
 
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_nr = X86_TRAP_GP;
 		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
 			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
 			die("general protection fault", regs, error_code);
-		goto exit;
+		return;
 	}
 
 	tsk->thread.error_code = error_code;
@@ -495,16 +476,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	}
 
 	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-exit:
-	exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(do_general_protection);
 
 /* May run on IST stack. */
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/*
 	 * ftrace must be first, everything else may cause a recursive crash.
@@ -517,7 +494,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	if (poke_int3_handler(regs))
 		return;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
@@ -544,7 +521,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	preempt_conditional_cli(regs);
 	debug_stack_usage_dec();
 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_int3);
 
@@ -620,12 +597,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret);
 dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	enum ctx_state prev_state;
 	int user_icebp = 0;
 	unsigned long dr6;
 	int si_code;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	get_debugreg(dr6, 6);
 
@@ -700,7 +676,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	debug_stack_usage_dec();
 
 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug);
 
@@ -752,23 +728,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_MF);
-	exception_exit(prev_state);
 }
 
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_XF);
-	exception_exit(prev_state);
 }
 
 dotraplinkage void
@@ -780,9 +748,6 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	BUG_ON(use_eager_fpu());
 
@@ -794,7 +759,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 
 		info.regs = regs;
 		math_emulate(&info);
-		exception_exit(prev_state);
 		return;
 	}
 #endif
@@ -802,7 +766,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 #ifdef CONFIG_X86_32
 	conditional_sti(regs);
 #endif
-	exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(do_device_not_available);
 
@@ -810,9 +773,7 @@ NOKPROBE_SYMBOL(do_device_not_available);
 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 {
 	siginfo_t info;
-	enum ctx_state prev_state;
 
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	local_irq_enable();
 
@@ -825,7 +786,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
 			&info);
 	}
-	exception_exit(prev_state);
 }
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 0333a209cbf600e980fc55c24878a56f25f48b65 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:34 -0700
Subject: x86/irq, context_tracking: Document how IRQ context tracking works
 and add an RCU assertion

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/e8bdc4ed0193fb2fd130f3d6b7b8023e2ec1ab62.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/irq.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 88b366487b0e..6233de046c08 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -216,8 +216,23 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	unsigned vector = ~regs->orig_ax;
 	unsigned irq;
 
+	/*
+	 * NB: Unlike exception entries, IRQ entries do not reliably
+	 * handle context tracking in the low-level entry code.  This is
+	 * because syscall entries execute briefly with IRQs on before
+	 * updating context tracking state, so we can take an IRQ from
+	 * kernel mode with CONTEXT_USER.  The low-level entry code only
+	 * updates the context if we came from user mode, so we won't
+	 * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
+	 * code is cleaned up enough that we can cleanly defer enabling
+	 * IRQs.
+	 */
+
 	entering_irq();
 
+	/* entering_irq() tells RCU that we're not quiescent.  Check it. */
+	rcu_lockdep_assert(rcu_is_watching(), "IRQ failed to wake up RCU");
+
 	irq = __this_cpu_read(vector_irq[vector]);
 
 	if (!handle_irq(irq, regs)) {
-- 
cgit v1.2.3-70-g09d2


From 9d87cd61a6b71ee00b7576a3ebc10208becdbea1 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Tue, 7 Jul 2015 18:26:13 +0200
Subject: x86/irq: Hide 'HYP:' line in /proc/interrupts when not on Xen/Hyper-V
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hypervisor callback interrupts are only accounted on
Xen/Hyper-V. There is no point in having always-zero HYP: line
on other hypervisors or bare metal. Print the line only if
HYPERVISOR_CALLBACK_VECTOR was allocated.

Reported-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Andrew Jones <drjones@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1436286373-11908-1-git-send-email-vkuznets@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/irq.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 88b366487b0e..5b537479de83 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -139,10 +139,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 	seq_puts(p, "  Machine check polls\n");
 #endif
 #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
-	seq_printf(p, "%*s: ", prec, "HYP");
-	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
-	seq_puts(p, "  Hypervisor callback interrupts\n");
+	if (test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) {
+		seq_printf(p, "%*s: ", prec, "HYP");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ",
+				   irq_stats(j)->irq_hv_callback_count);
+		seq_puts(p, "  Hypervisor callback interrupts\n");
+	}
 #endif
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
-- 
cgit v1.2.3-70-g09d2


From 5f2dbbc51734fc51e8e3e2c3ab7096a58ac72e86 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:14 +0800
Subject: x86/irq: Use accessor irq_data_get_node()

Use accessor irq_data_get_node() to hide struct irq_data
implementation detail, so we can move node to irq_data_common later.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/vector.c | 8 ++++----
 arch/x86/platform/uv/uv_irq.c | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 28eba2d38b15..9b62f690b0ff 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -296,7 +296,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 	struct irq_alloc_info *info = arg;
 	struct apic_chip_data *data;
 	struct irq_data *irq_data;
-	int i, err;
+	int i, err, node;
 
 	if (disable_apic)
 		return -ENXIO;
@@ -308,12 +308,13 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 	for (i = 0; i < nr_irqs; i++) {
 		irq_data = irq_domain_get_irq_data(domain, virq + i);
 		BUG_ON(!irq_data);
+		node = irq_data_get_node(irq_data);
 #ifdef	CONFIG_X86_IO_APIC
 		if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
 			data = legacy_irq_data[virq + i];
 		else
 #endif
-			data = alloc_apic_chip_data(irq_data->node);
+			data = alloc_apic_chip_data(node);
 		if (!data) {
 			err = -ENOMEM;
 			goto error;
@@ -322,8 +323,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 		irq_data->chip = &lapic_controller;
 		irq_data->chip_data = data;
 		irq_data->hwirq = virq + i;
-		err = assign_irq_vector_policy(virq, irq_data->node, data,
-					       info);
+		err = assign_irq_vector_policy(virq, node, data, info);
 		if (err)
 			goto error;
 	}
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 8570abe68be1..e1c24631afbb 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -89,7 +89,7 @@ static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq,
 		return -EINVAL;
 
 	chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL,
-				 irq_data->node);
+				 irq_data_get_node(irq_data));
 	if (!chip_data)
 		return -ENOMEM;
 
-- 
cgit v1.2.3-70-g09d2


From ff96b4d0333baa001b404882b28b7d992b02415b Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:18 +0800
Subject: x86/irq: Use accessor irq_data_get_irq_handler_data()

Use accessor function irq_data_get_irq_handler_data() to hide irq_desc
implementation details.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/msi.c | 2 +-
 arch/x86/kernel/hpet.c     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 1a9d735e09c6..5f1feb6854af 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -264,7 +264,7 @@ static inline int hpet_dev_id(struct irq_domain *domain)
 
 static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	hpet_msi_write(data->handler_data, msg);
+	hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
 }
 
 static struct irq_chip hpet_msi_controller = {
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 10757d0a3fcf..2de7e4e07dfa 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -426,7 +426,7 @@ static struct irq_domain *hpet_domain;
 
 void hpet_msi_unmask(struct irq_data *data)
 {
-	struct hpet_dev *hdev = data->handler_data;
+	struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
 	unsigned int cfg;
 
 	/* unmask it */
@@ -437,7 +437,7 @@ void hpet_msi_unmask(struct irq_data *data)
 
 void hpet_msi_mask(struct irq_data *data)
 {
-	struct hpet_dev *hdev = data->handler_data;
+	struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
 	unsigned int cfg;
 
 	/* mask it */
-- 
cgit v1.2.3-70-g09d2


From c149e4cd08ba01f4d2d0104f469d5f5419294e06 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Wed, 3 Jun 2015 11:46:22 +0800
Subject: x86/irq: Use access helper irq_data_get_affinity_mask()

This is a preparatory patch for moving irq_data struct members.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/io_apic.c   | 2 +-
 arch/x86/kernel/apic/vector.c    | 3 ++-
 arch/x86/kernel/irq.c            | 5 +++--
 drivers/xen/events/events_base.c | 4 ++--
 4 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 845dc0df2002..09921de4210f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2541,7 +2541,7 @@ void __init setup_ioapic_dest(void)
 		 * Honour affinities which have been set in early boot
 		 */
 		if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
-			mask = idata->affinity;
+			mask = irq_data_get_affinity_mask(idata);
 		else
 			mask = apic->target_cpus();
 
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 9b62f690b0ff..7ad911ea4f56 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -496,7 +496,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
 	if (err) {
 		struct irq_data *top = irq_get_irq_data(irq);
 
-		if (assign_irq_vector(irq, data, top->affinity))
+		if (assign_irq_vector(irq, data,
+				      irq_data_get_affinity_mask(top)))
 			pr_err("Failed to recover vector for irq %d\n", irq);
 		return err;
 	}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 88b366487b0e..7ed9cba27637 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -348,7 +348,8 @@ int check_irq_vectors_for_cpu_disable(void)
 				continue;
 
 			data = irq_desc_get_irq_data(desc);
-			cpumask_copy(&affinity_new, data->affinity);
+			cpumask_copy(&affinity_new,
+				     irq_data_get_affinity_mask(data));
 			cpumask_clear_cpu(this_cpu, &affinity_new);
 
 			/* Do not count inactive or per-cpu irqs. */
@@ -426,7 +427,7 @@ void fixup_irqs(void)
 		raw_spin_lock(&desc->lock);
 
 		data = irq_desc_get_irq_data(desc);
-		affinity = data->affinity;
+		affinity = irq_data_get_affinity_mask(data);
 		if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
 		    cpumask_subset(affinity, cpu_online_mask)) {
 			raw_spin_unlock(&desc->lock);
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 96093ae369a5..ed8bf1067a97 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -336,7 +336,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 
 	BUG_ON(irq == -1);
 #ifdef CONFIG_SMP
-	cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu));
+	cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
 #endif
 	xen_evtchn_port_bind_to_cpu(info, cpu);
 
@@ -373,7 +373,7 @@ static void xen_irq_init(unsigned irq)
 	struct irq_info *info;
 #ifdef CONFIG_SMP
 	/* By default all event channels notify CPU#0. */
-	cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0));
+	cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0));
 #endif
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
-- 
cgit v1.2.3-70-g09d2


From ce0d3c0a6fb1422101498ef378c0851dabbbf67f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Jul 2015 22:03:57 +0200
Subject: genirq: Revert sparse irq locking around __cpu_up() and move it to
 x86 for now

Boris reported that the sparse_irq protection around __cpu_up() in the
generic code causes a regression on Xen. Xen allocates interrupts and
some more in the xen_cpu_up() function, so it deadlocks on the
sparse_irq_lock.

There is no simple fix for this and we really should have the
protection for all architectures, but for now the only solution is to
move it to x86 where actual wreckage due to the lack of protection has
been observed.

Reported-and-tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Fixes: a89941816726 'hotplug: Prevent alloc/free of irq descriptors during cpu up/down'
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: xiao jin <jin.xiao@intel.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Cc: xen-devel <xen-devel@lists.xenproject.org>
---
 arch/x86/kernel/smpboot.c | 11 +++++++++++
 kernel/cpu.c              |  9 ---------
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d3010aa79daf..b1f3ed9c7a9e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -992,8 +992,17 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 
 	common_cpu_up(cpu, tidle);
 
+	/*
+	 * We have to walk the irq descriptors to setup the vector
+	 * space for the cpu which comes online.  Prevent irq
+	 * alloc/free across the bringup.
+	 */
+	irq_lock_sparse();
+
 	err = do_boot_cpu(apicid, cpu, tidle);
+
 	if (err) {
+		irq_unlock_sparse();
 		pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
 		return -EIO;
 	}
@@ -1011,6 +1020,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 		touch_nmi_watchdog();
 	}
 
+	irq_unlock_sparse();
+
 	return 0;
 }
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6a374544d495..5644ec5582b9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -527,18 +527,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	}
 
-	/*
-	 * Some architectures have to walk the irq descriptors to
-	 * setup the vector space for the cpu which comes online.
-	 * Prevent irq alloc/free across the bringup.
-	 */
-	irq_lock_sparse();
-
 	/* Arch-specific enabling code. */
 	ret = __cpu_up(cpu, idle);
 
-	irq_unlock_sparse();
-
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
-- 
cgit v1.2.3-70-g09d2


From 23ae2a16bb39d999892a86a65933fe3e9b6b525f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 8 Jul 2015 17:45:05 +0300
Subject: x86/platform/iosf_mbi: Move to dedicated folder

Move the driver to arch/x86/platform/intel since it is not a core
kernel code and it is related to many Intel SoCs from different
groups: Atom, MID, etc.

There is no functional change.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: David E . Box <david.e.box@linux.intel.com>
Link: http://lkml.kernel.org/r/1436366709-17683-2-git-send-email-andriy.shevchenko@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/Makefile           |   1 -
 arch/x86/kernel/iosf_mbi.c         | 328 -------------------------------------
 arch/x86/platform/Makefile         |   1 +
 arch/x86/platform/intel/Makefile   |   1 +
 arch/x86/platform/intel/iosf_mbi.c | 328 +++++++++++++++++++++++++++++++++++++
 5 files changed, 330 insertions(+), 329 deletions(-)
 delete mode 100644 arch/x86/kernel/iosf_mbi.c
 create mode 100644 arch/x86/platform/intel/Makefile
 create mode 100644 arch/x86/platform/intel/iosf_mbi.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 81db53ba9da8..7041f8b5161b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -107,7 +107,6 @@ obj-$(CONFIG_EFI)			+= sysfb_efi.o
 
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
-obj-$(CONFIG_IOSF_MBI)			+= iosf_mbi.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
deleted file mode 100644
index 82f8d02f0df2..000000000000
--- a/arch/x86/kernel/iosf_mbi.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * IOSF-SB MailBox Interface Driver
- * Copyright (c) 2013, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- *
- * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a
- * mailbox interface (MBI) to communicate with mutiple devices. This
- * driver implements access to this interface for those platforms that can
- * enumerate the device using PCI.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/debugfs.h>
-#include <linux/capability.h>
-
-#include <asm/iosf_mbi.h>
-
-#define PCI_DEVICE_ID_BAYTRAIL		0x0F00
-#define PCI_DEVICE_ID_BRASWELL		0x2280
-#define PCI_DEVICE_ID_QUARK_X1000	0x0958
-
-static DEFINE_SPINLOCK(iosf_mbi_lock);
-
-static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
-{
-	return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
-}
-
-static struct pci_dev *mbi_pdev;	/* one mbi device */
-
-static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
-{
-	int result;
-
-	if (!mbi_pdev)
-		return -ENODEV;
-
-	if (mcrx) {
-		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
-						mcrx);
-		if (result < 0)
-			goto fail_read;
-	}
-
-	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
-	if (result < 0)
-		goto fail_read;
-
-	result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
-	if (result < 0)
-		goto fail_read;
-
-	return 0;
-
-fail_read:
-	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
-	return result;
-}
-
-static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
-{
-	int result;
-
-	if (!mbi_pdev)
-		return -ENODEV;
-
-	result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
-	if (result < 0)
-		goto fail_write;
-
-	if (mcrx) {
-		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
-						mcrx);
-		if (result < 0)
-			goto fail_write;
-	}
-
-	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
-	if (result < 0)
-		goto fail_write;
-
-	return 0;
-
-fail_write:
-	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
-	return result;
-}
-
-int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
-{
-	u32 mcr, mcrx;
-	unsigned long flags;
-	int ret;
-
-	/*Access to the GFX unit is handled by GPU code */
-	if (port == BT_MBI_UNIT_GFX) {
-		WARN_ON(1);
-		return -EPERM;
-	}
-
-	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
-	mcrx = offset & MBI_MASK_HI;
-
-	spin_lock_irqsave(&iosf_mbi_lock, flags);
-	ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
-	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(iosf_mbi_read);
-
-int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
-{
-	u32 mcr, mcrx;
-	unsigned long flags;
-	int ret;
-
-	/*Access to the GFX unit is handled by GPU code */
-	if (port == BT_MBI_UNIT_GFX) {
-		WARN_ON(1);
-		return -EPERM;
-	}
-
-	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
-	mcrx = offset & MBI_MASK_HI;
-
-	spin_lock_irqsave(&iosf_mbi_lock, flags);
-	ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
-	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(iosf_mbi_write);
-
-int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
-{
-	u32 mcr, mcrx;
-	u32 value;
-	unsigned long flags;
-	int ret;
-
-	/*Access to the GFX unit is handled by GPU code */
-	if (port == BT_MBI_UNIT_GFX) {
-		WARN_ON(1);
-		return -EPERM;
-	}
-
-	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
-	mcrx = offset & MBI_MASK_HI;
-
-	spin_lock_irqsave(&iosf_mbi_lock, flags);
-
-	/* Read current mdr value */
-	ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
-	if (ret < 0) {
-		spin_unlock_irqrestore(&iosf_mbi_lock, flags);
-		return ret;
-	}
-
-	/* Apply mask */
-	value &= ~mask;
-	mdr &= mask;
-	value |= mdr;
-
-	/* Write back */
-	ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
-
-	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(iosf_mbi_modify);
-
-bool iosf_mbi_available(void)
-{
-	/* Mbi isn't hot-pluggable. No remove routine is provided */
-	return mbi_pdev;
-}
-EXPORT_SYMBOL(iosf_mbi_available);
-
-#ifdef CONFIG_IOSF_MBI_DEBUG
-static u32	dbg_mdr;
-static u32	dbg_mcr;
-static u32	dbg_mcrx;
-
-static int mcr_get(void *data, u64 *val)
-{
-	*val = *(u32 *)data;
-	return 0;
-}
-
-static int mcr_set(void *data, u64 val)
-{
-	u8 command = ((u32)val & 0xFF000000) >> 24,
-	   port	   = ((u32)val & 0x00FF0000) >> 16,
-	   offset  = ((u32)val & 0x0000FF00) >> 8;
-	int err;
-
-	*(u32 *)data = val;
-
-	if (!capable(CAP_SYS_RAWIO))
-		return -EACCES;
-
-	if (command & 1u)
-		err = iosf_mbi_write(port,
-			       command,
-			       dbg_mcrx | offset,
-			       dbg_mdr);
-	else
-		err = iosf_mbi_read(port,
-			      command,
-			      dbg_mcrx | offset,
-			      &dbg_mdr);
-
-	return err;
-}
-DEFINE_SIMPLE_ATTRIBUTE(iosf_mcr_fops, mcr_get, mcr_set , "%llx\n");
-
-static struct dentry *iosf_dbg;
-
-static void iosf_sideband_debug_init(void)
-{
-	struct dentry *d;
-
-	iosf_dbg = debugfs_create_dir("iosf_sb", NULL);
-	if (IS_ERR_OR_NULL(iosf_dbg))
-		return;
-
-	/* mdr */
-	d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr);
-	if (IS_ERR_OR_NULL(d))
-		goto cleanup;
-
-	/* mcrx */
-	debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
-	if (IS_ERR_OR_NULL(d))
-		goto cleanup;
-
-	/* mcr - initiates mailbox tranaction */
-	debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
-	if (IS_ERR_OR_NULL(d))
-		goto cleanup;
-
-	return;
-
-cleanup:
-	debugfs_remove_recursive(d);
-}
-
-static void iosf_debugfs_init(void)
-{
-	iosf_sideband_debug_init();
-}
-
-static void iosf_debugfs_remove(void)
-{
-	debugfs_remove_recursive(iosf_dbg);
-}
-#else
-static inline void iosf_debugfs_init(void) { }
-static inline void iosf_debugfs_remove(void) { }
-#endif /* CONFIG_IOSF_MBI_DEBUG */
-
-static int iosf_mbi_probe(struct pci_dev *pdev,
-			  const struct pci_device_id *unused)
-{
-	int ret;
-
-	ret = pci_enable_device(pdev);
-	if (ret < 0) {
-		dev_err(&pdev->dev, "error: could not enable device\n");
-		return ret;
-	}
-
-	mbi_pdev = pci_dev_get(pdev);
-	return 0;
-}
-
-static const struct pci_device_id iosf_mbi_pci_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
-
-static struct pci_driver iosf_mbi_pci_driver = {
-	.name		= "iosf_mbi_pci",
-	.probe		= iosf_mbi_probe,
-	.id_table	= iosf_mbi_pci_ids,
-};
-
-static int __init iosf_mbi_init(void)
-{
-	iosf_debugfs_init();
-
-	return pci_register_driver(&iosf_mbi_pci_driver);
-}
-
-static void __exit iosf_mbi_exit(void)
-{
-	iosf_debugfs_remove();
-
-	pci_unregister_driver(&iosf_mbi_pci_driver);
-	if (mbi_pdev) {
-		pci_dev_put(mbi_pdev);
-		mbi_pdev = NULL;
-	}
-}
-
-module_init(iosf_mbi_init);
-module_exit(iosf_mbi_exit);
-
-MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
-MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index f1a6c8e86ddd..184842ef332e 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -5,6 +5,7 @@ obj-y	+= efi/
 obj-y	+= geode/
 obj-y	+= goldfish/
 obj-y	+= iris/
+obj-y	+= intel/
 obj-y	+= intel-mid/
 obj-y	+= intel-quark/
 obj-y	+= olpc/
diff --git a/arch/x86/platform/intel/Makefile b/arch/x86/platform/intel/Makefile
new file mode 100644
index 000000000000..b878032fbc82
--- /dev/null
+++ b/arch/x86/platform/intel/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_IOSF_MBI)			+= iosf_mbi.o
diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c
new file mode 100644
index 000000000000..82f8d02f0df2
--- /dev/null
+++ b/arch/x86/platform/intel/iosf_mbi.c
@@ -0,0 +1,328 @@
+/*
+ * IOSF-SB MailBox Interface Driver
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ *
+ * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a
+ * mailbox interface (MBI) to communicate with mutiple devices. This
+ * driver implements access to this interface for those platforms that can
+ * enumerate the device using PCI.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/capability.h>
+
+#include <asm/iosf_mbi.h>
+
+#define PCI_DEVICE_ID_BAYTRAIL		0x0F00
+#define PCI_DEVICE_ID_BRASWELL		0x2280
+#define PCI_DEVICE_ID_QUARK_X1000	0x0958
+
+static DEFINE_SPINLOCK(iosf_mbi_lock);
+
+static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
+{
+	return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
+}
+
+static struct pci_dev *mbi_pdev;	/* one mbi device */
+
+static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
+{
+	int result;
+
+	if (!mbi_pdev)
+		return -ENODEV;
+
+	if (mcrx) {
+		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+						mcrx);
+		if (result < 0)
+			goto fail_read;
+	}
+
+	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+	if (result < 0)
+		goto fail_read;
+
+	result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+	if (result < 0)
+		goto fail_read;
+
+	return 0;
+
+fail_read:
+	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+	return result;
+}
+
+static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
+{
+	int result;
+
+	if (!mbi_pdev)
+		return -ENODEV;
+
+	result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+	if (result < 0)
+		goto fail_write;
+
+	if (mcrx) {
+		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+						mcrx);
+		if (result < 0)
+			goto fail_write;
+	}
+
+	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+	if (result < 0)
+		goto fail_write;
+
+	return 0;
+
+fail_write:
+	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+	return result;
+}
+
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
+{
+	u32 mcr, mcrx;
+	unsigned long flags;
+	int ret;
+
+	/*Access to the GFX unit is handled by GPU code */
+	if (port == BT_MBI_UNIT_GFX) {
+		WARN_ON(1);
+		return -EPERM;
+	}
+
+	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+	mcrx = offset & MBI_MASK_HI;
+
+	spin_lock_irqsave(&iosf_mbi_lock, flags);
+	ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
+	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_read);
+
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
+{
+	u32 mcr, mcrx;
+	unsigned long flags;
+	int ret;
+
+	/*Access to the GFX unit is handled by GPU code */
+	if (port == BT_MBI_UNIT_GFX) {
+		WARN_ON(1);
+		return -EPERM;
+	}
+
+	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+	mcrx = offset & MBI_MASK_HI;
+
+	spin_lock_irqsave(&iosf_mbi_lock, flags);
+	ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
+	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_write);
+
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
+{
+	u32 mcr, mcrx;
+	u32 value;
+	unsigned long flags;
+	int ret;
+
+	/*Access to the GFX unit is handled by GPU code */
+	if (port == BT_MBI_UNIT_GFX) {
+		WARN_ON(1);
+		return -EPERM;
+	}
+
+	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+	mcrx = offset & MBI_MASK_HI;
+
+	spin_lock_irqsave(&iosf_mbi_lock, flags);
+
+	/* Read current mdr value */
+	ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
+	if (ret < 0) {
+		spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+		return ret;
+	}
+
+	/* Apply mask */
+	value &= ~mask;
+	mdr &= mask;
+	value |= mdr;
+
+	/* Write back */
+	ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
+
+	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_modify);
+
+bool iosf_mbi_available(void)
+{
+	/* Mbi isn't hot-pluggable. No remove routine is provided */
+	return mbi_pdev;
+}
+EXPORT_SYMBOL(iosf_mbi_available);
+
+#ifdef CONFIG_IOSF_MBI_DEBUG
+static u32	dbg_mdr;
+static u32	dbg_mcr;
+static u32	dbg_mcrx;
+
+static int mcr_get(void *data, u64 *val)
+{
+	*val = *(u32 *)data;
+	return 0;
+}
+
+static int mcr_set(void *data, u64 val)
+{
+	u8 command = ((u32)val & 0xFF000000) >> 24,
+	   port	   = ((u32)val & 0x00FF0000) >> 16,
+	   offset  = ((u32)val & 0x0000FF00) >> 8;
+	int err;
+
+	*(u32 *)data = val;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EACCES;
+
+	if (command & 1u)
+		err = iosf_mbi_write(port,
+			       command,
+			       dbg_mcrx | offset,
+			       dbg_mdr);
+	else
+		err = iosf_mbi_read(port,
+			      command,
+			      dbg_mcrx | offset,
+			      &dbg_mdr);
+
+	return err;
+}
+DEFINE_SIMPLE_ATTRIBUTE(iosf_mcr_fops, mcr_get, mcr_set , "%llx\n");
+
+static struct dentry *iosf_dbg;
+
+static void iosf_sideband_debug_init(void)
+{
+	struct dentry *d;
+
+	iosf_dbg = debugfs_create_dir("iosf_sb", NULL);
+	if (IS_ERR_OR_NULL(iosf_dbg))
+		return;
+
+	/* mdr */
+	d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	/* mcrx */
+	debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	/* mcr - initiates mailbox tranaction */
+	debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	return;
+
+cleanup:
+	debugfs_remove_recursive(d);
+}
+
+static void iosf_debugfs_init(void)
+{
+	iosf_sideband_debug_init();
+}
+
+static void iosf_debugfs_remove(void)
+{
+	debugfs_remove_recursive(iosf_dbg);
+}
+#else
+static inline void iosf_debugfs_init(void) { }
+static inline void iosf_debugfs_remove(void) { }
+#endif /* CONFIG_IOSF_MBI_DEBUG */
+
+static int iosf_mbi_probe(struct pci_dev *pdev,
+			  const struct pci_device_id *unused)
+{
+	int ret;
+
+	ret = pci_enable_device(pdev);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "error: could not enable device\n");
+		return ret;
+	}
+
+	mbi_pdev = pci_dev_get(pdev);
+	return 0;
+}
+
+static const struct pci_device_id iosf_mbi_pci_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
+	{ 0, },
+};
+MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
+
+static struct pci_driver iosf_mbi_pci_driver = {
+	.name		= "iosf_mbi_pci",
+	.probe		= iosf_mbi_probe,
+	.id_table	= iosf_mbi_pci_ids,
+};
+
+static int __init iosf_mbi_init(void)
+{
+	iosf_debugfs_init();
+
+	return pci_register_driver(&iosf_mbi_pci_driver);
+}
+
+static void __exit iosf_mbi_exit(void)
+{
+	iosf_debugfs_remove();
+
+	pci_unregister_driver(&iosf_mbi_pci_driver);
+	if (mbi_pdev) {
+		pci_dev_put(mbi_pdev);
+		mbi_pdev = NULL;
+	}
+}
+
+module_init(iosf_mbi_init);
+module_exit(iosf_mbi_exit);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-70-g09d2


From 9d05041679904b12c12421cbcf9cb5f4860a8d7b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 15 Jul 2015 10:29:33 -0700
Subject: x86/nmi: Enable nested do_nmi() handling for 64-bit kernels

32-bit kernels handle nested NMIs in C.  Enable the exact same
handling on 64-bit kernels as well.  This isn't currently
necessary, but it will become necessary once the asm code starts
allowing limited nesting.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/nmi.c | 123 +++++++++++++++++++++-----------------------------
 1 file changed, 52 insertions(+), 71 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d1751c..d8766b1c9974 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
 NOKPROBE_SYMBOL(default_do_nmi);
 
 /*
- * NMIs can hit breakpoints which will cause it to lose its
- * NMI context with the CPU when the breakpoint does an iret.
- */
-#ifdef CONFIG_X86_32
-/*
- * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C (preventing nested
- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
- * can be in:
+ * NMIs can hit breakpoints which will cause it to lose its NMI context
+ * with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due an IRET during
+ * NMI processing.  On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
  *
  *  1) not running
  *  2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
  * (Note, the latch is binary, thus multiple NMIs triggering,
  *  when one is running, are ignored. Only one NMI is restarted.)
  *
- * If an NMI hits a breakpoint that executes an iret, another
- * NMI can preempt it. We do not want to allow this new NMI
- * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the exit of the first NMI will
- * perform a dec_return, if the result is zero (NOT_RUNNING), then
- * it will simply exit the NMI handler. If not, the dec_return
- * would have set the state to NMI_EXECUTING (what we want it to
- * be when we are running). In this case, we simply jump back
- * to rerun the NMI handler again, and restart the 'latched' NMI.
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes.  We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
  *
  * No trap (breakpoint or page fault) should be hit before nmi_restart,
  * thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
 static DEFINE_PER_CPU(enum nmi_states, nmi_state);
 static DEFINE_PER_CPU(unsigned long, nmi_cr2);
 
-#define nmi_nesting_preprocess(regs)					\
-	do {								\
-		if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {	\
-			this_cpu_write(nmi_state, NMI_LATCHED);		\
-			return;						\
-		}							\
-		this_cpu_write(nmi_state, NMI_EXECUTING);		\
-		this_cpu_write(nmi_cr2, read_cr2());			\
-	} while (0);							\
-	nmi_restart:
-
-#define nmi_nesting_postprocess()					\
-	do {								\
-		if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))	\
-			write_cr2(this_cpu_read(nmi_cr2));		\
-		if (this_cpu_dec_return(nmi_state))			\
-			goto nmi_restart;				\
-	} while (0)
-#else /* x86_64 */
+#ifdef CONFIG_X86_64
 /*
- * In x86_64 things are a bit more difficult. This has the same problem
- * where an NMI hitting a breakpoint that calls iret will remove the
- * NMI context, allowing a nested NMI to enter. What makes this more
- * difficult is that both NMIs and breakpoints have their own stack.
- * When a new NMI or breakpoint is executed, the stack is set to a fixed
- * point. If an NMI is nested, it will have its stack set at that same
- * fixed address that the first NMI had, and will start corrupting the
- * stack. This is handled in entry_64.S, but the same problem exists with
- * the breakpoint stack.
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint.  Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
  *
- * If a breakpoint is being processed, and the debug stack is being used,
- * if an NMI comes in and also hits a breakpoint, the stack pointer
- * will be set to the same fixed address as the breakpoint that was
- * interrupted, causing that stack to be corrupted. To handle this case,
- * check if the stack that was interrupted is the debug stack, and if
- * so, change the IDT so that new breakpoints will use the current stack
- * and not switch to the fixed address. On return of the NMI, switch back
- * to the original IDT.
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
  */
 static DEFINE_PER_CPU(int, update_debug_stack);
+#endif
 
-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
 {
+	if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+		this_cpu_write(nmi_state, NMI_LATCHED);
+		return;
+	}
+	this_cpu_write(nmi_state, NMI_EXECUTING);
+	this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
 	/*
 	 * If we interrupted a breakpoint, it is possible that
 	 * the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
 		debug_stack_set_zero();
 		this_cpu_write(update_debug_stack, 1);
 	}
-}
-
-static inline void nmi_nesting_postprocess(void)
-{
-	if (unlikely(this_cpu_read(update_debug_stack))) {
-		debug_stack_reset();
-		this_cpu_write(update_debug_stack, 0);
-	}
-}
 #endif
 
-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	nmi_nesting_preprocess(regs);
-
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)
 
 	nmi_exit();
 
-	/* On i386, may loop back to preprocess */
-	nmi_nesting_postprocess();
+#ifdef CONFIG_X86_64
+	if (unlikely(this_cpu_read(update_debug_stack))) {
+		debug_stack_reset();
+		this_cpu_write(update_debug_stack, 0);
+	}
+#endif
+
+	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+		write_cr2(this_cpu_read(nmi_cr2));
+	if (this_cpu_dec_return(nmi_state))
+		goto nmi_restart;
 }
 NOKPROBE_SYMBOL(do_nmi);
 
-- 
cgit v1.2.3-70-g09d2


From 0b22930ebad563ae97ff3f8d7b9f12060b4c6e6b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 15 Jul 2015 10:29:36 -0700
Subject: x86/nmi/64: Improve nested NMI comments

I found the nested NMI documentation to be difficult to follow.
Improve the comments.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 158 +++++++++++++++++++++++++++-------------------
 arch/x86/kernel/nmi.c     |   4 +-
 2 files changed, 94 insertions(+), 68 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 8668bbdd2bca..f54d63a60a3b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1237,11 +1237,12 @@ ENTRY(nmi)
 	 *  If the variable is not set and the stack is not the NMI
 	 *  stack then:
 	 *    o Set the special variable on the stack
-	 *    o Copy the interrupt frame into a "saved" location on the stack
-	 *    o Copy the interrupt frame into a "copy" location on the stack
+	 *    o Copy the interrupt frame into an "outermost" location on the
+	 *      stack
+	 *    o Copy the interrupt frame into an "iret" location on the stack
 	 *    o Continue processing the NMI
 	 *  If the variable is set or the previous stack is the NMI stack:
-	 *    o Modify the "copy" location to jump to the repeate_nmi
+	 *    o Modify the "iret" location to jump to the repeat_nmi
 	 *    o return back to the first NMI
 	 *
 	 * Now on exit of the first NMI, we first clear the stack variable
@@ -1317,18 +1318,60 @@ ENTRY(nmi)
 
 .Lnmi_from_kernel:
 	/*
-	 * Check the special variable on the stack to see if NMIs are
-	 * executing.
+	 * Here's what our stack frame will look like:
+	 * +---------------------------------------------------------+
+	 * | original SS                                             |
+	 * | original Return RSP                                     |
+	 * | original RFLAGS                                         |
+	 * | original CS                                             |
+	 * | original RIP                                            |
+	 * +---------------------------------------------------------+
+	 * | temp storage for rdx                                    |
+	 * +---------------------------------------------------------+
+	 * | "NMI executing" variable                                |
+	 * +---------------------------------------------------------+
+	 * | iret SS          } Copied from "outermost" frame        |
+	 * | iret Return RSP  } on each loop iteration; overwritten  |
+	 * | iret RFLAGS      } by a nested NMI to force another     |
+	 * | iret CS          } iteration if needed.                 |
+	 * | iret RIP         }                                      |
+	 * +---------------------------------------------------------+
+	 * | outermost SS          } initialized in first_nmi;       |
+	 * | outermost Return RSP  } will not be changed before      |
+	 * | outermost RFLAGS      } NMI processing is done.         |
+	 * | outermost CS          } Copied to "iret" frame on each  |
+	 * | outermost RIP         } iteration.                      |
+	 * +---------------------------------------------------------+
+	 * | pt_regs                                                 |
+	 * +---------------------------------------------------------+
+	 *
+	 * The "original" frame is used by hardware.  Before re-enabling
+	 * NMIs, we need to be done with it, and we need to leave enough
+	 * space for the asm code here.
+	 *
+	 * We return by executing IRET while RSP points to the "iret" frame.
+	 * That will either return for real or it will loop back into NMI
+	 * processing.
+	 *
+	 * The "outermost" frame is copied to the "iret" frame on each
+	 * iteration of the loop, so each iteration starts with the "iret"
+	 * frame pointing to the final return target.
+	 */
+
+	/*
+	 * Determine whether we're a nested NMI.
+	 *
+	 * First check "NMI executing".  If it's set, then we're nested.
+	 * This will not detect if we interrupted an outer NMI just
+	 * before IRET.
 	 */
 	cmpl	$1, -8(%rsp)
 	je	nested_nmi
 
 	/*
-	 * Now test if the previous stack was an NMI stack.
-	 * We need the double check. We check the NMI stack to satisfy the
-	 * race when the first NMI clears the variable before returning.
-	 * We check the variable because the first NMI could be in a
-	 * breakpoint routine using a breakpoint stack.
+	 * Now test if the previous stack was an NMI stack.  This covers
+	 * the case where we interrupt an outer NMI after it clears
+	 * "NMI executing" but before IRET.
 	 */
 	lea	6*8(%rsp), %rdx
 	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1344,9 +1387,11 @@ ENTRY(nmi)
 
 nested_nmi:
 	/*
-	 * Do nothing if we interrupted the fixup in repeat_nmi.
-	 * It's about to repeat the NMI handler, so we are fine
-	 * with ignoring this one.
+	 * If we interrupted an NMI that is between repeat_nmi and
+	 * end_repeat_nmi, then we must not modify the "iret" frame
+	 * because it's being written by the outer NMI.  That's okay;
+	 * the outer NMI handler is about to call do_nmi anyway,
+	 * so we can just resume the outer NMI.
 	 */
 	movq	$repeat_nmi, %rdx
 	cmpq	8(%rsp), %rdx
@@ -1356,7 +1401,10 @@ nested_nmi:
 	ja	nested_nmi_out
 
 1:
-	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
+	/*
+	 * Modify the "iret" frame to point to repeat_nmi, forcing another
+	 * iteration of NMI handling.
+	 */
 	leaq	-1*8(%rsp), %rdx
 	movq	%rdx, %rsp
 	leaq	-10*8(%rsp), %rdx
@@ -1372,61 +1420,27 @@ nested_nmi:
 nested_nmi_out:
 	popq	%rdx
 
-	/* No need to check faults here */
+	/* We are returning to kernel mode, so this cannot result in a fault. */
 	INTERRUPT_RETURN
 
 first_nmi:
-	/*
-	 * Because nested NMIs will use the pushed location that we
-	 * stored in rdx, we must keep that space available.
-	 * Here's what our stack frame will look like:
-	 * +-------------------------+
-	 * | original SS             |
-	 * | original Return RSP     |
-	 * | original RFLAGS         |
-	 * | original CS             |
-	 * | original RIP            |
-	 * +-------------------------+
-	 * | temp storage for rdx    |
-	 * +-------------------------+
-	 * | NMI executing variable  |
-	 * +-------------------------+
-	 * | copied SS               |
-	 * | copied Return RSP       |
-	 * | copied RFLAGS           |
-	 * | copied CS               |
-	 * | copied RIP              |
-	 * +-------------------------+
-	 * | Saved SS                |
-	 * | Saved Return RSP        |
-	 * | Saved RFLAGS            |
-	 * | Saved CS                |
-	 * | Saved RIP               |
-	 * +-------------------------+
-	 * | pt_regs                 |
-	 * +-------------------------+
-	 *
-	 * The saved stack frame is used to fix up the copied stack frame
-	 * that a nested NMI may change to make the interrupted NMI iret jump
-	 * to the repeat_nmi. The original stack frame and the temp storage
-	 * is also used by nested NMIs and can not be trusted on exit.
-	 */
-	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+	/* Restore rdx. */
 	movq	(%rsp), %rdx
 
-	/* Set the NMI executing variable on the stack. */
+	/* Set "NMI executing" on the stack. */
 	pushq	$1
 
-	/* Leave room for the "copied" frame */
+	/* Leave room for the "iret" frame */
 	subq	$(5*8), %rsp
 
-	/* Copy the stack frame to the Saved frame */
+	/* Copy the "original" frame to the "outermost" frame */
 	.rept 5
 	pushq	11*8(%rsp)
 	.endr
 
 	/* Everything up to here is safe from nested NMIs */
 
+repeat_nmi:
 	/*
 	 * If there was a nested NMI, the first NMI's iret will return
 	 * here. But NMIs are still enabled and we can take another
@@ -1435,16 +1449,21 @@ first_nmi:
 	 * it will just return, as we are about to repeat an NMI anyway.
 	 * This makes it safe to copy to the stack frame that a nested
 	 * NMI will update.
-	 */
-repeat_nmi:
-	/*
-	 * Update the stack variable to say we are still in NMI (the update
-	 * is benign for the non-repeat case, where 1 was pushed just above
-	 * to this very stack slot).
+	 *
+	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
+	 * we're repeating an NMI, gsbase has the same value that it had on
+	 * the first iteration.  paranoid_entry will load the kernel
+	 * gsbase if needed before we call do_nmi.
+	 *
+	 * Set "NMI executing" in case we came back here via IRET.
 	 */
 	movq	$1, 10*8(%rsp)
 
-	/* Make another copy, this one may be modified by nested NMIs */
+	/*
+	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
+	 * here must not modify the "iret" frame while we're writing to
+	 * it or it will end up containing garbage.
+	 */
 	addq	$(10*8), %rsp
 	.rept 5
 	pushq	-6*8(%rsp)
@@ -1453,9 +1472,9 @@ repeat_nmi:
 end_repeat_nmi:
 
 	/*
-	 * Everything below this point can be preempted by a nested
-	 * NMI if the first NMI took an exception and reset our iret stack
-	 * so that we repeat another NMI.
+	 * Everything below this point can be preempted by a nested NMI.
+	 * If this happens, then the inner NMI will change the "iret"
+	 * frame to point back to repeat_nmi.
 	 */
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	ALLOC_PT_GPREGS_ON_STACK
@@ -1481,11 +1500,18 @@ nmi_swapgs:
 nmi_restore:
 	RESTORE_EXTRA_REGS
 	RESTORE_C_REGS
-	/* Pop the extra iret frame at once */
+
+	/* Point RSP at the "iret" frame. */
 	REMOVE_PT_GPREGS_FROM_STACK 6*8
 
-	/* Clear the NMI executing stack variable */
+	/* Clear "NMI executing". */
 	movq	$0, 5*8(%rsp)
+
+	/*
+	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+	 * stack in a single instruction.  We are returning to kernel
+	 * mode, so this cannot result in a fault.
+	 */
 	INTERRUPT_RETURN
 END(nmi)
 
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index d8766b1c9974..d05bd2e2ee91 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,8 +408,8 @@ static void default_do_nmi(struct pt_regs *regs)
 NOKPROBE_SYMBOL(default_do_nmi);
 
 /*
- * NMIs can hit breakpoints which will cause it to lose its NMI context
- * with the CPU when the breakpoint or page fault does an IRET.
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
  *
  * As a result, NMIs can nest if NMIs get unmasked due an IRET during
  * NMI processing.  On x86_64, the asm glue protects us from nested NMIs
-- 
cgit v1.2.3-70-g09d2


From 4d7489ffba0aef4d2c708b6ff1428efd6ccf41df Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Fri, 10 Jul 2015 21:47:36 +0100
Subject: nmi: x86: convert to generic nmi handler

Convert x86 to use the generic nmi handler code which can be shared
between architectures.

Reviewed-and-tested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/x86/kernel/apic/hw_nmi.c | 133 ++----------------------------------------
 1 file changed, 4 insertions(+), 129 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..045e424fb368 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
 #endif
 
 #ifdef arch_trigger_all_cpu_backtrace
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE		4096
-
-struct nmi_seq_buf {
-	unsigned char		buffer[NMI_BUF_SIZE];
-	struct seq_buf		seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
-
-/* "in progress" flag of arch_trigger_all_cpu_backtrace */
-static unsigned long backtrace_flag;
-
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+static void nmi_raise_cpu_backtrace(cpumask_t *mask)
 {
-	const char *buf = s->buffer + start;
-
-	printk("%.*s", (end - start) + 1, buf);
+	apic->send_IPI_mask(mask, NMI_VECTOR);
 }
 
 void arch_trigger_all_cpu_backtrace(bool include_self)
 {
-	struct nmi_seq_buf *s;
-	int len;
-	int cpu;
-	int i;
-	int this_cpu = get_cpu();
-
-	if (test_and_set_bit(0, &backtrace_flag)) {
-		/*
-		 * If there is already a trigger_all_cpu_backtrace() in progress
-		 * (backtrace_flag == 1), don't output double cpu dump infos.
-		 */
-		put_cpu();
-		return;
-	}
-
-	cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-	if (!include_self)
-		cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
-
-	cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-	/*
-	 * Set up per_cpu seq_buf buffers that the NMIs running on the other
-	 * CPUs will write to.
-	 */
-	for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
-		s = &per_cpu(nmi_print_seq, cpu);
-		seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
-	}
-
-	if (!cpumask_empty(to_cpumask(backtrace_mask))) {
-		pr_info("sending NMI to %s CPUs:\n",
-			(include_self ? "all" : "other"));
-		apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
-	}
-
-	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
-	for (i = 0; i < 10 * 1000; i++) {
-		if (cpumask_empty(to_cpumask(backtrace_mask)))
-			break;
-		mdelay(1);
-		touch_softlockup_watchdog();
-	}
-
-	/*
-	 * Now that all the NMIs have triggered, we can dump out their
-	 * back traces safely to the console.
-	 */
-	for_each_cpu(cpu, &printtrace_mask) {
-		int last_i = 0;
-
-		s = &per_cpu(nmi_print_seq, cpu);
-		len = seq_buf_used(&s->seq);
-		if (!len)
-			continue;
-
-		/* Print line by line. */
-		for (i = 0; i < len; i++) {
-			if (s->buffer[i] == '\n') {
-				print_seq_line(s, last_i, i);
-				last_i = i + 1;
-			}
-		}
-		/* Check if there was a partial line. */
-		if (last_i < len) {
-			print_seq_line(s, last_i, len - 1);
-			pr_cont("\n");
-		}
-	}
-
-	clear_bit(0, &backtrace_flag);
-	smp_mb__after_atomic();
-	put_cpu();
-}
-
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
-	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-	unsigned int len = seq_buf_used(&s->seq);
-
-	seq_buf_vprintf(&s->seq, fmt, args);
-	return seq_buf_used(&s->seq) - len;
+	nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
 }
 
 static int
 arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 {
-	int cpu;
-
-	cpu = smp_processor_id();
-
-	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		printk_func_t printk_func_save = this_cpu_read(printk_func);
-
-		/* Replace printk to write into the NMI seq */
-		this_cpu_write(printk_func, nmi_vprintk);
-		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
-		show_regs(regs);
-		this_cpu_write(printk_func, printk_func_save);
-
-		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+	if (nmi_cpu_backtrace(regs))
 		return NMI_HANDLED;
-	}
 
 	return NMI_DONE;
 }
-- 
cgit v1.2.3-70-g09d2


From 0c8c0f03e3a292e031596484275c14cf39c0ab7a Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@sr71.net>
Date: Fri, 17 Jul 2015 12:28:11 +0200
Subject: x86/fpu, sched: Dynamically allocate 'struct fpu'

The FPU rewrite removed the dynamic allocations of 'struct fpu'.
But, this potentially wastes massive amounts of memory (2k per
task on systems that do not have AVX-512 for instance).

Instead of having a separate slab, this patch just appends the
space that we need to the 'task_struct' which we dynamically
allocate already.  This saves from doing an extra slab
allocation at fork().

The only real downside here is that we have to stick everything
and the end of the task_struct.  But, I think the
BUILD_BUG_ON()s I stuck in there should keep that from being too
fragile.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437128892-9831-2-git-send-email-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/types.h | 72 +++++++++++++++++++++-------------------
 arch/x86/include/asm/processor.h | 10 ++++--
 arch/x86/kernel/fpu/init.c       | 39 ++++++++++++++++++++++
 arch/x86/kernel/process.c        |  2 +-
 fs/proc/kcore.c                  |  4 +--
 include/linux/sched.h            | 12 +++++--
 kernel/fork.c                    |  8 ++++-
 7 files changed, 104 insertions(+), 43 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0637826292de..c49c5173158e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -189,6 +189,7 @@ union fpregs_state {
 	struct fxregs_state		fxsave;
 	struct swregs_state		soft;
 	struct xregs_state		xsave;
+	u8 __padding[PAGE_SIZE];
 };
 
 /*
@@ -197,40 +198,6 @@ union fpregs_state {
  * state fields:
  */
 struct fpu {
-	/*
-	 * @state:
-	 *
-	 * In-memory copy of all FPU registers that we save/restore
-	 * over context switches. If the task is using the FPU then
-	 * the registers in the FPU are more recent than this state
-	 * copy. If the task context-switches away then they get
-	 * saved here and represent the FPU state.
-	 *
-	 * After context switches there may be a (short) time period
-	 * during which the in-FPU hardware registers are unchanged
-	 * and still perfectly match this state, if the tasks
-	 * scheduled afterwards are not using the FPU.
-	 *
-	 * This is the 'lazy restore' window of optimization, which
-	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
-	 *
-	 * We detect whether a subsequent task uses the FPU via setting
-	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
-	 *
-	 * During this window, if the task gets scheduled again, we
-	 * might be able to skip having to do a restore from this
-	 * memory buffer to the hardware registers - at the cost of
-	 * incurring the overhead of #NM fault traps.
-	 *
-	 * Note that on modern CPUs that support the XSAVEOPT (or other
-	 * optimized XSAVE instructions), we don't use #NM traps anymore,
-	 * as the hardware can track whether FPU registers need saving
-	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
-	 * logic, which unconditionally saves/restores all FPU state
-	 * across context switches. (if FPU state exists.)
-	 */
-	union fpregs_state		state;
-
 	/*
 	 * @last_cpu:
 	 *
@@ -288,6 +255,43 @@ struct fpu {
 	 * deal with bursty apps that only use the FPU for a short time:
 	 */
 	unsigned char			counter;
+	/*
+	 * @state:
+	 *
+	 * In-memory copy of all FPU registers that we save/restore
+	 * over context switches. If the task is using the FPU then
+	 * the registers in the FPU are more recent than this state
+	 * copy. If the task context-switches away then they get
+	 * saved here and represent the FPU state.
+	 *
+	 * After context switches there may be a (short) time period
+	 * during which the in-FPU hardware registers are unchanged
+	 * and still perfectly match this state, if the tasks
+	 * scheduled afterwards are not using the FPU.
+	 *
+	 * This is the 'lazy restore' window of optimization, which
+	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+	 *
+	 * We detect whether a subsequent task uses the FPU via setting
+	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+	 *
+	 * During this window, if the task gets scheduled again, we
+	 * might be able to skip having to do a restore from this
+	 * memory buffer to the hardware registers - at the cost of
+	 * incurring the overhead of #NM fault traps.
+	 *
+	 * Note that on modern CPUs that support the XSAVEOPT (or other
+	 * optimized XSAVE instructions), we don't use #NM traps anymore,
+	 * as the hardware can track whether FPU registers need saving
+	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+	 * logic, which unconditionally saves/restores all FPU state
+	 * across context switches. (if FPU state exists.)
+	 */
+	union fpregs_state		state;
+	/*
+	 * WARNING: 'state' is dynamically-sized.  Do not put
+	 * anything after it here.
+	 */
 };
 
 #endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..944f1785ed0d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -390,9 +390,6 @@ struct thread_struct {
 #endif
 	unsigned long		gs;
 
-	/* Floating point and extended processor state */
-	struct fpu		fpu;
-
 	/* Save middle states of ptrace breakpoints */
 	struct perf_event	*ptrace_bps[HBP_NUM];
 	/* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
+	/*
+	 * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at
+	 * the end.
+	 */
 };
 
 /*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 32826791e675..deacbfa6b33e 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -136,6 +136,45 @@ static void __init fpu__init_system_generic(void)
 unsigned int xstate_size;
 EXPORT_SYMBOL_GPL(xstate_size);
 
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER)	\
+	BUILD_BUG_ON((sizeof(TYPE) -			\
+			offsetof(TYPE, MEMBER) -	\
+			sizeof(((TYPE *)0)->MEMBER)) > 	\
+			0)				\
+
+/*
+ * We append the 'struct fpu' to the task_struct.
+ */
+int __weak arch_task_struct_size(void)
+{
+	int task_size = sizeof(struct task_struct);
+
+	/*
+	 * Subtract off the static size of the register state.
+	 * It potentially has a bunch of padding.
+	 */
+	task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+	/*
+	 * Add back the dynamically-calculated register state
+	 * size.
+	 */
+	task_size += xstate_size;
+
+	/*
+	 * We dynamically size 'struct fpu', so we require that
+	 * it be at the end of 'thread_struct' and that
+	 * 'thread_struct' be at the end of 'task_struct'.  If
+	 * you hit a compile error here, check the structure to
+	 * see if something got added to the end.
+	 */
+	CHECK_MEMBER_AT_END_OF(struct fpu, state);
+	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+	return task_size;
+}
+
 /*
  * Set up the xstate_size based on the legacy FPU context size.
  *
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9cad694ed7c4..975420eac105 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	*dst = *src;
+	memcpy(dst, src, arch_task_struct_size());
 
 	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 91a4e6426321..a0fe99485687 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 			     roundup(sizeof(CORE_STR), 4)) +
 			roundup(sizeof(struct elf_prstatus), 4) +
 			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(sizeof(struct task_struct), 4);
+			roundup(arch_task_struct_size(), 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	/* set up the task structure */
 	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].datasz	= arch_task_struct_size();
 	notes[2].data	= current;
 
 	nhdr->p_filesz	+= notesize(&notes[2]);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae21f1591615..e43a41d892b6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1522,8 +1522,6 @@ struct task_struct {
 /* hung task detection */
 	unsigned long last_switch_count;
 #endif
-/* CPU-specific state of this task */
-	struct thread_struct thread;
 /* filesystem information */
 	struct fs_struct *fs;
 /* open file information */
@@ -1778,8 +1776,18 @@ struct task_struct {
 	unsigned long	task_state_change;
 #endif
 	int pagefault_disabled;
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+/*
+ * WARNING: on x86, 'thread_struct' contains a variable-sized
+ * structure.  It *MUST* be at the end of 'task_struct'.
+ *
+ * Do not put anything below here!
+ */
 };
 
+extern int arch_task_struct_size(void);
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..431b67a6098c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,15 +287,21 @@ static void set_max_threads(unsigned int max_threads_suggested)
 	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
+int __weak arch_task_struct_size(void)
+{
+	return sizeof(struct task_struct);
+}
+
 void __init fork_init(void)
 {
+	int task_struct_size = arch_task_struct_size();
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
+		kmem_cache_create("task_struct", task_struct_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 5aaeb5c01c5b6c0be7b7aadbf3ace9f3a4458c3d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 17 Jul 2015 12:28:12 +0200
Subject: x86/fpu, sched: Introduce CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and
 use it on x86

Don't burden architectures without dynamic task_struct sizing
with the overhead of dynamic sizing.

Also optimize the x86 code a bit by caching task_struct_size.

Acked-and-Tested-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437128892-9831-3-git-send-email-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/Kconfig               |  4 ++++
 arch/x86/Kconfig           |  1 +
 arch/x86/kernel/fpu/init.c | 17 +++++++++--------
 arch/x86/kernel/process.c  |  2 +-
 fs/proc/kcore.c            |  4 ++--
 include/linux/sched.h      |  6 +++++-
 kernel/fork.c              | 11 +++++------
 7 files changed, 27 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index bec6666a3cc4..8a8ea7110de8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -221,6 +221,10 @@ config ARCH_TASK_STRUCT_ALLOCATOR
 config ARCH_THREAD_INFO_ALLOCATOR
 	bool
 
+# Select if arch wants to size task_struct dynamically via arch_task_struct_size:
+config ARCH_WANTS_DYNAMIC_TASK_STRUCT
+	bool
+
 config HAVE_REGS_AND_STACK_ACCESS_API
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3dbb7e7909ca..b3a1a5d77d92 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF		if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION	if X86_32
 	select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index deacbfa6b33e..0b39173dd971 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -4,6 +4,8 @@
 #include <asm/fpu/internal.h>
 #include <asm/tlbflush.h>
 
+#include <linux/sched.h>
+
 /*
  * Initialize the TS bit in CR0 according to the style of context-switches
  * we are using:
@@ -136,16 +138,14 @@ static void __init fpu__init_system_generic(void)
 unsigned int xstate_size;
 EXPORT_SYMBOL_GPL(xstate_size);
 
-#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER)	\
-	BUILD_BUG_ON((sizeof(TYPE) -			\
-			offsetof(TYPE, MEMBER) -	\
-			sizeof(((TYPE *)0)->MEMBER)) > 	\
-			0)				\
+/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+	BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
 
 /*
- * We append the 'struct fpu' to the task_struct.
+ * We append the 'struct fpu' to the task_struct:
  */
-int __weak arch_task_struct_size(void)
+static void __init fpu__init_task_struct_size(void)
 {
 	int task_size = sizeof(struct task_struct);
 
@@ -172,7 +172,7 @@ int __weak arch_task_struct_size(void)
 	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
 	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
 
-	return task_size;
+	arch_task_struct_size = task_size;
 }
 
 /*
@@ -326,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
 	fpu__init_system_generic();
 	fpu__init_system_xstate_size_legacy();
 	fpu__init_system_xstate();
+	fpu__init_task_struct_size();
 
 	fpu__init_system_ctx_switch();
 }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 975420eac105..397688beed4b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	memcpy(dst, src, arch_task_struct_size());
+	memcpy(dst, src, arch_task_struct_size);
 
 	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a0fe99485687..92e6726f6e37 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 			     roundup(sizeof(CORE_STR), 4)) +
 			roundup(sizeof(struct elf_prstatus), 4) +
 			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(arch_task_struct_size(), 4);
+			roundup(arch_task_struct_size, 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	/* set up the task structure */
 	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= arch_task_struct_size();
+	notes[2].datasz	= arch_task_struct_size;
 	notes[2].data	= current;
 
 	nhdr->p_filesz	+= notesize(&notes[2]);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e43a41d892b6..04b5ada460b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1786,7 +1786,11 @@ struct task_struct {
  */
 };
 
-extern int arch_task_struct_size(void);
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+extern int arch_task_struct_size __read_mostly;
+#else
+# define arch_task_struct_size (sizeof(struct task_struct))
+#endif
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
diff --git a/kernel/fork.c b/kernel/fork.c
index 431b67a6098c..dbd9b8d7b7cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,21 +287,20 @@ static void set_max_threads(unsigned int max_threads_suggested)
 	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
-int __weak arch_task_struct_size(void)
-{
-	return sizeof(struct task_struct);
-}
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
 
 void __init fork_init(void)
 {
-	int task_struct_size = arch_task_struct_size();
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", task_struct_size,
+		kmem_cache_create("task_struct", arch_task_struct_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 4daa832d99871356f5fdc52372c975e40f73a15e Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Mon, 20 Jul 2015 18:32:53 +0200
Subject: x86: Drop bogus __ref / __refdata annotations

The __ref / __refdata annotations used to be needed because of
referencing functions / variables annotated __cpuinit /
__cpuinitdata.

But those annotations vanished during the development of v3.11.

Therefore most of the __ref / __refdata annotations are not needed
anymore. As they may hide legitimate sections mismatches, we
better get rid of them.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437409973-8927-1-git-send-email-minipli@googlemail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/acpi/boot.c                 | 8 +-------
 arch/x86/kernel/apic/x2apic_cluster.c       | 2 +-
 arch/x86/kernel/cpu/microcode/core.c        | 2 +-
 arch/x86/kernel/cpu/microcode/intel_early.c | 2 +-
 arch/x86/kernel/cpuid.c                     | 2 +-
 arch/x86/kernel/smpboot.c                   | 2 +-
 arch/x86/kernel/topology.c                  | 4 ++--
 7 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e49ee24da85e..75e8bad53798 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -710,7 +710,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 #endif
 }
 
-static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
 {
 	int cpu;
 
@@ -726,12 +726,6 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
 	*pcpu = cpu;
 	return 0;
 }
-
-/* wrapper to silence section mismatch warning */
-int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
-{
-	return _acpi_map_lsapic(handle, physid, pcpu);
-}
 EXPORT_SYMBOL(acpi_map_cpu);
 
 int acpi_unmap_cpu(int cpu)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ab3219b3fbda..e709bc276feb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -182,7 +182,7 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return notifier_from_errno(err);
 }
 
-static struct notifier_block __refdata x2apic_cpu_notifier = {
+static struct notifier_block x2apic_cpu_notifier = {
 	.notifier_call = update_clusterinfo,
 };
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 6236a54a63f4..532026d48096 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -460,7 +460,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __refdata mc_cpu_notifier = {
+static struct notifier_block mc_cpu_notifier = {
 	.notifier_call	= mc_cpu_callback,
 };
 
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 8187b7247d1c..37ea89c11520 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -390,7 +390,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 }
 
 #ifdef DEBUG
-static void __ref show_saved_mc(void)
+static void show_saved_mc(void)
 {
 	int i, j;
 	unsigned int sig, pf, rev, total_size, data_size, date;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 83741a71558f..bd3507da39f0 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb,
 	return notifier_from_errno(err);
 }
 
-static struct notifier_block __refdata cpuid_class_cpu_notifier =
+static struct notifier_block cpuid_class_cpu_notifier =
 {
 	.notifier_call = cpuid_class_cpu_callback,
 };
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b1f3ed9c7a9e..1d06cf8c0093 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1358,7 +1358,7 @@ static void remove_siblinginfo(int cpu)
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
 }
 
-static void __ref remove_cpu_from_maps(int cpu)
+static void remove_cpu_from_maps(int cpu)
 {
 	set_cpu_online(cpu, false);
 	cpumask_clear_cpu(cpu, cpu_callout_mask);
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 649b010da00b..12cbe2b88c0f 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -57,7 +57,7 @@ __setup("cpu0_hotplug", enable_cpu0_hotplug);
  *
  * This is only called for debugging CPU offline/online feature.
  */
-int __ref _debug_hotplug_cpu(int cpu, int action)
+int _debug_hotplug_cpu(int cpu, int action)
 {
 	struct device *dev = get_cpu_device(cpu);
 	int ret;
@@ -104,7 +104,7 @@ static int __init debug_hotplug_cpu(void)
 late_initcall_sync(debug_hotplug_cpu);
 #endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */
 
-int __ref arch_register_cpu(int num)
+int arch_register_cpu(int num)
 {
 	struct cpuinfo_x86 *c = &cpu_data(num);
 
-- 
cgit v1.2.3-70-g09d2


From b51ef52df71cb28e9d90cd1d48b79bf19f0bab06 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@fedoraproject.org>
Date: Mon, 20 Jul 2015 14:47:58 -0700
Subject: x86/cpu: Restore MSR_IA32_ENERGY_PERF_BIAS after resume

MSR_IA32_ENERGY_PERF_BIAS is lost after suspend/resume:

	x86_energy_perf_policy -r before

	cpu0: 0x0000000000000006
	cpu1: 0x0000000000000006
	cpu2: 0x0000000000000006
	cpu3: 0x0000000000000006
	cpu4: 0x0000000000000006
	cpu5: 0x0000000000000006
	cpu6: 0x0000000000000006
	cpu7: 0x0000000000000006

	after

	cpu0: 0x0000000000000000
	cpu1: 0x0000000000000006
	cpu2: 0x0000000000000006
	cpu3: 0x0000000000000006
	cpu4: 0x0000000000000006
	cpu5: 0x0000000000000006
	cpu6: 0x0000000000000006
	cpu7: 0x0000000000000006

Resulting in inconsistent energy policy settings across CPUs.

This register is set via init_intel() at bootup. During resume,
the secondary CPUs are brought online again and init_intel() is
callled which re-initializes the register. The boot CPU however
never reinitializes the register.

Add a syscore callback to reinitialize the register for the boot CPU.

Signed-off-by: Laura Abbott <labbott@fedoraproject.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437428878-4105-1-git-send-email-labbott@fedoraproject.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 18 +++++++++++++++++
 arch/x86/kernel/cpu/cpu.h    |  1 +
 arch/x86/kernel/cpu/intel.c  | 47 ++++++++++++++++++++++++++++++--------------
 3 files changed, 51 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 922c5e0cea4c..fa95bb8829ce 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
 #include <linux/kgdb.h>
 #include <linux/smp.h>
 #include <linux/io.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/stackprotector.h>
 #include <asm/perf_event.h>
@@ -1488,3 +1489,20 @@ inline bool __static_cpu_has_safe(u16 bit)
 	return boot_cpu_has(bit);
 }
 EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
+
+static void bsp_resume(void)
+{
+	if (this_cpu->c_bsp_resume)
+		this_cpu->c_bsp_resume(&boot_cpu_data);
+}
+
+static struct syscore_ops cpu_syscore_ops = {
+	.resume		= bsp_resume,
+};
+
+static int __init init_cpu_syscore(void)
+{
+	register_syscore_ops(&cpu_syscore_ops);
+	return 0;
+}
+core_initcall(init_cpu_syscore);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index c37dc37e8317..2584265d4745 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -13,6 +13,7 @@ struct cpu_dev {
 	void		(*c_init)(struct cpuinfo_x86 *);
 	void		(*c_identify)(struct cpuinfo_x86 *);
 	void		(*c_detect_tlb)(struct cpuinfo_x86 *);
+	void		(*c_bsp_resume)(struct cpuinfo_x86 *);
 	int		c_x86_vendor;
 #ifdef CONFIG_X86_32
 	/* Optional vendor specific routine to obtain the cache size. */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 50163fa9034f..98a13db5f4be 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -371,6 +371,36 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
 	}
 }
 
+static void init_intel_energy_perf(struct cpuinfo_x86 *c)
+{
+	u64 epb;
+
+	/*
+	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized.
+	 * (x86_energy_perf_policy(8) is available to change it at run-time.)
+	 */
+	if (!cpu_has(c, X86_FEATURE_EPB))
+		return;
+
+	rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+	if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
+		return;
+
+	pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+	pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
+	epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+	wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+}
+
+static void intel_bsp_resume(struct cpuinfo_x86 *c)
+{
+	/*
+	 * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume,
+	 * so reinitialize it properly like during bootup:
+	 */
+	init_intel_energy_perf(c);
+}
+
 static void init_intel(struct cpuinfo_x86 *c)
 {
 	unsigned int l2 = 0;
@@ -478,21 +508,7 @@ static void init_intel(struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_VMX))
 		detect_vmx_virtcap(c);
 
-	/*
-	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
-	 * x86_energy_perf_policy(8) is available to change it at run-time
-	 */
-	if (cpu_has(c, X86_FEATURE_EPB)) {
-		u64 epb;
-
-		rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
-		if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
-			pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
-			pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
-			epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
-			wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
-		}
-	}
+	init_intel_energy_perf(c);
 }
 
 #ifdef CONFIG_X86_32
@@ -747,6 +763,7 @@ static const struct cpu_dev intel_cpu_dev = {
 	.c_detect_tlb	= intel_detect_tlb,
 	.c_early_init   = early_init_intel,
 	.c_init		= init_intel,
+	.c_bsp_resume	= intel_bsp_resume,
 	.c_x86_vendor	= X86_VENDOR_INTEL,
 };
 
-- 
cgit v1.2.3-70-g09d2


From bf9f2ee28d475ada0005c59382852cb70f1419ac Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 20 Jul 2015 11:52:23 -0700
Subject: x86/nmi: Remove the 'b2b' parameter from nmi_handle()

It has never had any effect. Remove it for comprehensibility.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c91fa38507760d9e54a4b8737fa6409bde896b33.1437418322.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/nmi.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d1751c..f76d6500f458 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
 		a->handler, whole_msecs, decimal_msecs);
 }
 
-static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
 	struct nmiaction *a;
@@ -213,7 +213,7 @@ static void
 pci_serr_error(unsigned char reason, struct pt_regs *regs)
 {
 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_SERR, regs, false))
+	if (nmi_handle(NMI_SERR, regs))
 		return;
 
 	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 	unsigned long i;
 
 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_IO_CHECK, regs, false))
+	if (nmi_handle(NMI_IO_CHECK, regs))
 		return;
 
 	pr_emerg(
@@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 	 * as only the first one is ever run (unless it can actually determine
 	 * if it caused the NMI)
 	 */
-	handled = nmi_handle(NMI_UNKNOWN, regs, false);
+	handled = nmi_handle(NMI_UNKNOWN, regs);
 	if (handled) {
 		__this_cpu_add(nmi_stats.unknown, handled);
 		return;
@@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs)
 
 	__this_cpu_write(last_nmi_rip, regs->ip);
 
-	handled = nmi_handle(NMI_LOCAL, regs, b2b);
+	handled = nmi_handle(NMI_LOCAL, regs);
 	__this_cpu_add(nmi_stats.normal, handled);
 	if (handled) {
 		/*
-- 
cgit v1.2.3-70-g09d2


From 5bc016f1abaa1c5ac0e3af23aa79faec4634a074 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Mon, 20 Jul 2015 08:49:01 +0100
Subject: x86/fpu: Disable dependent CPU features on "noxsave"

Complete the set of dependent features that need disabling at
once: XSAVEC, AVX-512 and all currently known to the kernel
extensions to it, as well as MPX need to be disabled too.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/55ACC40D0200007800092E6C@mail.emea.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/init.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 0b39173dd971..1e173f6285c7 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -351,9 +351,15 @@ static int __init x86_noxsave_setup(char *s)
 
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
 	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
 	setup_clear_cpu_cap(X86_FEATURE_AVX);
 	setup_clear_cpu_cap(X86_FEATURE_AVX2);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+	setup_clear_cpu_cap(X86_FEATURE_MPX);
 
 	return 1;
 }
-- 
cgit v1.2.3-70-g09d2


From 0233606ce5cf12c1a0e27cb197066ea5bc2bb488 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 19 Jul 2015 21:09:04 -0400
Subject: x86/entry/vm86: Clean up saved_fs/gs

There is no need to save FS and non-lazy GS outside the 32-bit
regs.  Lazy GS still needs to be saved because it wasn't saved
on syscall entry.  Save it in the gs slot of regs32, which is
present but unused.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437354550-25858-2-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 2 --
 arch/x86/kernel/vm86_32.c        | 6 ++----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..f4e4e3ff753c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -410,8 +410,6 @@ struct thread_struct {
 	unsigned long		v86flags;
 	unsigned long		v86mask;
 	unsigned long		saved_sp0;
-	unsigned int		saved_fs;
-	unsigned int		saved_gs;
 #endif
 	/* IO permissions: */
 	unsigned long		*io_bitmap_ptr;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index fc9db6ef2a95..761a2f9039a3 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -159,8 +159,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 
 	ret = KVM86->regs32;
 
-	ret->fs = current->thread.saved_fs;
-	set_user_gs(ret, current->thread.saved_gs);
+	lazy_load_gs(ret->gs);
 
 	return ret;
 }
@@ -315,8 +314,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
  */
 	info->regs32->ax = VM86_SIGNAL;
 	tsk->thread.saved_sp0 = tsk->thread.sp0;
-	tsk->thread.saved_fs = info->regs32->fs;
-	tsk->thread.saved_gs = get_user_gs(info->regs32);
+	lazy_save_gs(info->regs32->gs);
 
 	tss = &per_cpu(cpu_tss, get_cpu());
 	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
-- 
cgit v1.2.3-70-g09d2


From df1ae9a5dc66d9fd57109240042372b1065d984a Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 19 Jul 2015 21:09:05 -0400
Subject: x86/entry/vm86: Preserve 'orig_ax'

There is no legitimate reason for usermode to modify the 'orig_ax'
field on entry to vm86 mode, so copy it from the 32-bit regs.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437354550-25858-3-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/vm86_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 761a2f9039a3..9a2dc80059ab 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -294,6 +294,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
 	info->regs.pt.flags |= X86_VM_MASK;
 
+	info->regs.pt.orig_ax = info->regs32->orig_ax;
+
 	switch (info->cpu_type) {
 	case CPU_286:
 		tsk->thread.v86mask = 0;
-- 
cgit v1.2.3-70-g09d2


From ed0b2edb61ba4e557de759093d965654186f28b2 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 19 Jul 2015 21:09:06 -0400
Subject: x86/entry/vm86: Move userspace accesses to do_sys_vm86()

Move the userspace accesses down into the common function in
preparation for the next set of patches.  Also change to copying
the fields explicitly instead of assuming a fixed order in
pt_regs and the kernel data structures.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437354550-25858-4-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h |   2 +-
 arch/x86/kernel/vm86_32.c        | 189 +++++++++++++++++++++------------------
 2 files changed, 102 insertions(+), 89 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f4e4e3ff753c..35ad5547a417 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -405,7 +405,7 @@ struct thread_struct {
 	unsigned long		error_code;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
-	struct vm86_struct __user *vm86_info;
+	struct vm86plus_struct __user *vm86_info;
 	unsigned long		screen_bitmap;
 	unsigned long		v86flags;
 	unsigned long		v86mask;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9a2dc80059ab..e6c2b47ec261 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -90,46 +90,13 @@
 #define SAFE_MASK	(0xDD5)
 #define RETURN_MASK	(0xDFF)
 
-/* convert kernel_vm86_regs to vm86_regs */
-static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
-				  const struct kernel_vm86_regs *regs)
-{
-	int ret = 0;
-
-	/*
-	 * kernel_vm86_regs is missing gs, so copy everything up to
-	 * (but not including) orig_eax, and then rest including orig_eax.
-	 */
-	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
-			    sizeof(struct kernel_vm86_regs) -
-			    offsetof(struct kernel_vm86_regs, pt.orig_ax));
-
-	return ret;
-}
-
-/* convert vm86_regs to kernel_vm86_regs */
-static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
-				    const struct vm86_regs __user *user,
-				    unsigned extra)
-{
-	int ret = 0;
-
-	/* copy ax-fs inclusive */
-	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	/* copy orig_ax-__gsh+extra */
-	ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
-			      sizeof(struct kernel_vm86_regs) -
-			      offsetof(struct kernel_vm86_regs, pt.orig_ax) +
-			      extra);
-	return ret;
-}
-
 struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 {
 	struct tss_struct *tss;
 	struct pt_regs *ret;
-	unsigned long tmp;
+	struct task_struct *tsk = current;
+	struct vm86plus_struct __user *user;
+	long err = 0;
 
 	/*
 	 * This gets called from entry.S with interrupts disabled, but
@@ -138,23 +105,50 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	 */
 	local_irq_enable();
 
-	if (!current->thread.vm86_info) {
+	if (!tsk->thread.vm86_info) {
 		pr_alert("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
-	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
-	tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
-	if (tmp) {
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | tsk->thread.v86mask);
+	user = tsk->thread.vm86_info;
+
+	if (!access_ok(VERIFY_WRITE, user, VMPI.is_vm86pus ?
+		       sizeof(struct vm86plus_struct) :
+		       sizeof(struct vm86_struct))) {
+		pr_alert("could not access userspace vm86_info\n");
+		do_exit(SIGSEGV);
+	}
+
+	put_user_try {
+		put_user_ex(regs->pt.bx, &user->regs.ebx);
+		put_user_ex(regs->pt.cx, &user->regs.ecx);
+		put_user_ex(regs->pt.dx, &user->regs.edx);
+		put_user_ex(regs->pt.si, &user->regs.esi);
+		put_user_ex(regs->pt.di, &user->regs.edi);
+		put_user_ex(regs->pt.bp, &user->regs.ebp);
+		put_user_ex(regs->pt.ax, &user->regs.eax);
+		put_user_ex(regs->pt.ip, &user->regs.eip);
+		put_user_ex(regs->pt.cs, &user->regs.cs);
+		put_user_ex(regs->pt.flags, &user->regs.eflags);
+		put_user_ex(regs->pt.sp, &user->regs.esp);
+		put_user_ex(regs->pt.ss, &user->regs.ss);
+		put_user_ex(regs->es, &user->regs.es);
+		put_user_ex(regs->ds, &user->regs.ds);
+		put_user_ex(regs->fs, &user->regs.fs);
+		put_user_ex(regs->gs, &user->regs.gs);
+
+		put_user_ex(tsk->thread.screen_bitmap, &user->screen_bitmap);
+	} put_user_catch(err);
+	if (err) {
 		pr_alert("could not access userspace vm86_info\n");
 		do_exit(SIGSEGV);
 	}
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	current->thread.sp0 = current->thread.saved_sp0;
-	current->thread.sysenter_cs = __KERNEL_CS;
-	load_sp0(tss, &current->thread);
-	current->thread.saved_sp0 = 0;
+	tsk->thread.sp0 = tsk->thread.saved_sp0;
+	tsk->thread.sysenter_cs = __KERNEL_CS;
+	load_sp0(tss, &tsk->thread);
+	tsk->thread.saved_sp0 = 0;
 	put_cpu();
 
 	ret = KVM86->regs32;
@@ -199,7 +193,8 @@ out:
 
 
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
+			struct kernel_vm86_struct *info);
 
 SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
 {
@@ -208,21 +203,8 @@ SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
 					 * This remains on the stack until we
 					 * return to 32 bit user space.
 					 */
-	struct task_struct *tsk = current;
-	int tmp;
 
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, vm86plus) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
-	info.regs32 = current_pt_regs();
-	tsk->thread.vm86_info = v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) v86, false, &info);
 }
 
 
@@ -233,11 +215,7 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 					 * This remains on the stack until we
 					 * return to 32 bit user space.
 					 */
-	struct task_struct *tsk;
-	int tmp;
-	struct vm86plus_struct __user *v86;
 
-	tsk = current;
 	switch (cmd) {
 	case VM86_REQUEST_IRQ:
 	case VM86_FREE_IRQ:
@@ -255,34 +233,69 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 	}
 
 	/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	v86 = (struct vm86plus_struct __user *)arg;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, regs32) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	info.regs32 = current_pt_regs();
-	info.vm86plus.is_vm86pus = 1;
-	tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) arg, true, &info);
 }
 
 
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
+			struct kernel_vm86_struct *info)
 {
 	struct tss_struct *tss;
-/*
- * make sure the vm86() system call doesn't try to do anything silly
- */
-	info->regs.pt.ds = 0;
-	info->regs.pt.es = 0;
-	info->regs.pt.fs = 0;
-#ifndef CONFIG_X86_32_LAZY_GS
-	info->regs.pt.gs = 0;
-#endif
+	struct task_struct *tsk = current;
+	unsigned long err = 0;
+
+	if (tsk->thread.saved_sp0)
+		return -EPERM;
+
+	if (!access_ok(VERIFY_READ, v86, plus ?
+		       sizeof(struct vm86_struct) :
+		       sizeof(struct vm86plus_struct)))
+		return -EFAULT;
+
+	memset(info, 0, sizeof(*info));
+	get_user_try {
+		unsigned short seg;
+		get_user_ex(info->regs.pt.bx, &v86->regs.ebx);
+		get_user_ex(info->regs.pt.cx, &v86->regs.ecx);
+		get_user_ex(info->regs.pt.dx, &v86->regs.edx);
+		get_user_ex(info->regs.pt.si, &v86->regs.esi);
+		get_user_ex(info->regs.pt.di, &v86->regs.edi);
+		get_user_ex(info->regs.pt.bp, &v86->regs.ebp);
+		get_user_ex(info->regs.pt.ax, &v86->regs.eax);
+		get_user_ex(info->regs.pt.ip, &v86->regs.eip);
+		get_user_ex(seg, &v86->regs.cs);
+		info->regs.pt.cs = seg;
+		get_user_ex(info->regs.pt.flags, &v86->regs.eflags);
+		get_user_ex(info->regs.pt.sp, &v86->regs.esp);
+		get_user_ex(seg, &v86->regs.ss);
+		info->regs.pt.ss = seg;
+		get_user_ex(info->regs.es, &v86->regs.es);
+		get_user_ex(info->regs.ds, &v86->regs.ds);
+		get_user_ex(info->regs.fs, &v86->regs.fs);
+		get_user_ex(info->regs.gs, &v86->regs.gs);
+
+		get_user_ex(info->flags, &v86->flags);
+		get_user_ex(info->screen_bitmap, &v86->screen_bitmap);
+		get_user_ex(info->cpu_type, &v86->cpu_type);
+	} get_user_catch(err);
+	if (err)
+		return err;
+
+	if (copy_from_user(&info->int_revectored, &v86->int_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (copy_from_user(&info->int21_revectored, &v86->int21_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (plus) {
+		if (copy_from_user(&info->vm86plus, &v86->vm86plus,
+				   sizeof(struct vm86plus_info_struct)))
+			return -EFAULT;
+		info->vm86plus.is_vm86pus = 1;
+	}
+
+	info->regs32 = current_pt_regs();
+	tsk->thread.vm86_info = v86;
 
 /*
  * The flags register is also special: we cannot trust that the user
@@ -344,7 +357,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 		"jmp resume_userspace"
 		: /* no outputs */
 		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
-	/* we never return here */
+	unreachable();	/* we never return here */
 }
 
 static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-- 
cgit v1.2.3-70-g09d2


From 949163015ce6fdb76a5e846a3582d3c40c23c001 Mon Sep 17 00:00:00 2001
From: Paolo Pisati <p.pisati@gmail.com>
Date: Mon, 20 Jul 2015 18:23:50 +0200
Subject: x86/boot: Obsolete the MCA sys_desc_table

The kernel does not support the MCA bus anymroe, so mark sys_desc_table
as obsolete: remove any reference from the code together with the remaining
of MCA logic.

bloat-o-meter output:

  add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-55 (-55)
  function                                     old     new   delta
  i386_start_kernel                            128     119      -9
  setup_arch                                  1421    1375     -46

Signed-off-by: Paolo Pisati <p.pisati@gmail.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437409430-8491-1-git-send-email-p.pisati@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/zero-page.txt       |  3 ++-
 arch/x86/boot/Makefile                |  2 +-
 arch/x86/boot/boot.h                  |  3 ---
 arch/x86/boot/compressed/eboot.c      |  4 ----
 arch/x86/boot/main.c                  |  3 ---
 arch/x86/boot/mca.c                   | 38 -----------------------------------
 arch/x86/include/asm/processor.h      |  8 --------
 arch/x86/include/uapi/asm/bootparam.h |  2 +-
 arch/x86/kernel/kexec-bzimage64.c     |  3 ---
 arch/x86/kernel/setup.c               |  5 -----
 10 files changed, 4 insertions(+), 67 deletions(-)
 delete mode 100644 arch/x86/boot/mca.c

(limited to 'arch/x86/kernel')

diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
index 82fbdbc1e0b0..95a4d34af3fd 100644
--- a/Documentation/x86/zero-page.txt
+++ b/Documentation/x86/zero-page.txt
@@ -17,7 +17,8 @@ Offset	Proto	Name		Meaning
 				(struct ist_info)
 080/010	ALL	hd0_info	hd0 disk parameter, OBSOLETE!!
 090/010	ALL	hd1_info	hd1 disk parameter, OBSOLETE!!
-0A0/010	ALL	sys_desc_table	System description table (struct sys_desc_table)
+0A0/010	ALL	sys_desc_table	System description table (struct sys_desc_table),
+				OBSOLETE!!
 0B0/010	ALL	olpc_ofw_header	OLPC's OpenFirmware CIF and friends
 0C0/004	ALL	ext_ramdisk_image ramdisk_image high 32bits
 0C4/004	ALL	ext_ramdisk_size  ramdisk_size high 32bits
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 57bbf2fb21f6..0d553e54171b 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -23,7 +23,7 @@ targets		+= fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir-		:= compressed
 
 setup-y		+= a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
-setup-y		+= early_serial_console.o edd.o header.o main.o mca.o memory.o
+setup-y		+= early_serial_console.o edd.o header.o main.o memory.o
 setup-y		+= pm.o pmjump.o printf.o regs.o string.o tty.o video.o
 setup-y		+= video-mode.o version.o
 setup-$(CONFIG_X86_APM_BOOT) += apm.o
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index bd49ec61255c..0033e96c3f09 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -307,9 +307,6 @@ void query_edd(void);
 /* header.S */
 void __attribute__((noreturn)) die(void);
 
-/* mca.c */
-int query_mca(void);
-
 /* memory.c */
 int detect_memory(void);
 
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 2c82bd150d43..e140bccf9cf3 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1041,7 +1041,6 @@ void setup_graphics(struct boot_params *boot_params)
 struct boot_params *make_boot_params(struct efi_config *c)
 {
 	struct boot_params *boot_params;
-	struct sys_desc_table *sdt;
 	struct apm_bios_info *bi;
 	struct setup_header *hdr;
 	struct efi_info *efi;
@@ -1089,7 +1088,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
 	hdr = &boot_params->hdr;
 	efi = &boot_params->efi_info;
 	bi = &boot_params->apm_bios_info;
-	sdt = &boot_params->sys_desc_table;
 
 	/* Copy the second sector to boot_params */
 	memcpy(&hdr->jump, image->image_base + 512, 512);
@@ -1118,8 +1116,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
 	/* Clear APM BIOS info */
 	memset(bi, 0, sizeof(*bi));
 
-	memset(sdt, 0, sizeof(*sdt));
-
 	status = efi_parse_options(cmdline_ptr);
 	if (status != EFI_SUCCESS)
 		goto fail2;
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index fd6c9f236996..9bcea386db65 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -161,9 +161,6 @@ void main(void)
 	/* Set keyboard repeat rate (why?) and query the lock flags */
 	keyboard_init();
 
-	/* Query MCA information */
-	query_mca();
-
 	/* Query Intel SpeedStep (IST) information */
 	query_ist();
 
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
deleted file mode 100644
index a95a531148ef..000000000000
--- a/arch/x86/boot/mca.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- *   Copyright (C) 1991, 1992 Linus Torvalds
- *   Copyright 2007 rPath, Inc. - All Rights Reserved
- *   Copyright 2009 Intel Corporation; author H. Peter Anvin
- *
- *   This file is part of the Linux kernel, and is made available under
- *   the terms of the GNU General Public License version 2.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * Get the MCA system description table
- */
-
-#include "boot.h"
-
-int query_mca(void)
-{
-	struct biosregs ireg, oreg;
-	u16 len;
-
-	initregs(&ireg);
-	ireg.ah = 0xc0;
-	intcall(0x15, &ireg, &oreg);
-
-	if (oreg.eflags & X86_EFLAGS_CF)
-		return -1;	/* No MCA present */
-
-	set_fs(oreg.es);
-	len = rdfs16(oreg.bx);
-
-	if (len > sizeof(boot_params.sys_desc_table))
-		len = sizeof(boot_params.sys_desc_table);
-
-	copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
-	return 0;
-}
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..3e15e1358f21 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -647,14 +647,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
 
 extern void set_task_blockstep(struct task_struct *task, bool on);
 
-/*
- * from system description table in BIOS. Mostly for MCA use, but
- * others may find it useful:
- */
-extern unsigned int		machine_id;
-extern unsigned int		machine_submodel_id;
-extern unsigned int		BIOS_revision;
-
 /* Boot loader type from the setup header: */
 extern int			bootloader_type;
 extern int			bootloader_version;
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index ab456dc233b5..329254373479 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -120,7 +120,7 @@ struct boot_params {
 	__u8  _pad3[16];				/* 0x070 */
 	__u8  hd0_info[16];	/* obsolete! */		/* 0x080 */
 	__u8  hd1_info[16];	/* obsolete! */		/* 0x090 */
-	struct sys_desc_table sys_desc_table;		/* 0x0a0 */
+	struct sys_desc_table sys_desc_table; /* obsolete! */	/* 0x0a0 */
 	struct olpc_ofw_header olpc_ofw_header;		/* 0x0b0 */
 	__u32 ext_ramdisk_image;			/* 0x0c0 */
 	__u32 ext_ramdisk_size;				/* 0x0c4 */
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ca83f7ac388b..961e51e9c6f6 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -223,9 +223,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
 	memset(&params->hd0_info, 0, sizeof(params->hd0_info));
 	memset(&params->hd1_info, 0, sizeof(params->hd1_info));
 
-	/* Default sysdesc table */
-	params->sys_desc_table.length = 0;
-
 	if (image->type == KEXEC_TYPE_CRASH) {
 		ret = crash_setup_memmap_entries(image, params);
 		if (ret)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80f874bf999e..b143c2d04420 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -916,11 +916,6 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_X86_32
 	apm_info.bios = boot_params.apm_bios_info;
 	ist_info = boot_params.ist_info;
-	if (boot_params.sys_desc_table.length != 0) {
-		machine_id = boot_params.sys_desc_table.table[0];
-		machine_submodel_id = boot_params.sys_desc_table.table[1];
-		BIOS_revision = boot_params.sys_desc_table.table[2];
-	}
 #endif
 	saved_video_mode = boot_params.hdr.vid_mode;
 	bootloader_type = boot_params.hdr.type_of_loader;
-- 
cgit v1.2.3-70-g09d2


From f78f5b90c4ffa559e400c3919a02236101f29f3f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 18 Jun 2015 15:50:02 -0700
Subject: rcu: Rename rcu_lockdep_assert() to RCU_LOCKDEP_WARN()

This commit renames rcu_lockdep_assert() to RCU_LOCKDEP_WARN() for
consistency with the WARN() series of macros.  This also requires
inverting the sense of the conditional, which this commit also does.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/RCU/whatisRCU.txt  |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c |  6 ++--
 arch/x86/kernel/traps.c          |  2 +-
 drivers/base/power/opp.c         |  4 +--
 include/linux/fdtable.h          |  4 +--
 include/linux/rcupdate.h         | 63 ++++++++++++++++++++++++++--------------
 kernel/cgroup.c                  |  4 +--
 kernel/pid.c                     |  5 ++--
 kernel/rcu/srcu.c                | 10 +++----
 kernel/rcu/tiny.c                |  8 ++---
 kernel/rcu/tree.c                | 28 +++++++++---------
 kernel/rcu/tree_plugin.h         |  8 ++---
 kernel/rcu/update.c              |  4 +--
 kernel/sched/core.c              |  8 ++---
 kernel/workqueue.c               | 20 ++++++-------
 security/device_cgroup.c         |  6 ++--
 16 files changed, 101 insertions(+), 81 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 5746b0c77f3e..adc2184009c5 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -883,7 +883,7 @@ All:  lockdep-checked RCU-protected pointer access
 
 	rcu_access_pointer
 	rcu_dereference_raw
-	rcu_lockdep_assert
+	RCU_LOCKDEP_WARN
 	rcu_sleep_check
 	RCU_NONIDLE
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index df919ff103c3..3d6b5269fb2e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -54,9 +54,9 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 
 #define rcu_dereference_check_mce(p) \
 ({ \
-	rcu_lockdep_assert(rcu_read_lock_sched_held() || \
-			   lockdep_is_held(&mce_chrdev_read_mutex), \
-			   "suspicious rcu_dereference_check_mce() usage"); \
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+			 !lockdep_is_held(&mce_chrdev_read_mutex), \
+			 "suspicious rcu_dereference_check_mce() usage"); \
 	smp_load_acquire(&(p)); \
 })
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f5791927aa64..c5a5231d1d11 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -136,7 +136,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 	preempt_count_add(HARDIRQ_OFFSET);
 
 	/* This code is a bit fragile.  Test it. */
-	rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
 
 	return prev_state;
 }
diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index 677fb2843553..3b188f20b43f 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -110,8 +110,8 @@ static DEFINE_MUTEX(dev_opp_list_lock);
 
 #define opp_rcu_lockdep_assert()					\
 do {									\
-	rcu_lockdep_assert(rcu_read_lock_held() ||			\
-				lockdep_is_held(&dev_opp_list_lock),	\
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
+				!lockdep_is_held(&dev_opp_list_lock),	\
 			   "Missing rcu_read_lock() or "		\
 			   "dev_opp_list_lock protection");		\
 } while (0)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index fbb88740634a..674e3e226465 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -86,8 +86,8 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
 
 static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)
 {
-	rcu_lockdep_assert(rcu_read_lock_held() ||
-			   lockdep_is_held(&files->file_lock),
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
+			   !lockdep_is_held(&files->file_lock),
 			   "suspicious rcu_dereference_check() usage");
 	return __fcheck_files(files, fd);
 }
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 33ec16b9c2ee..ff476515f716 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -536,6 +536,11 @@ static inline int rcu_read_lock_sched_held(void)
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+/* Deprecate rcu_lockdep_assert():  Use RCU_LOCKDEP_WARN() instead. */
+static inline void __attribute((deprecated)) deprecate_rcu_lockdep_assert(void)
+{
+}
+
 #ifdef CONFIG_PROVE_RCU
 
 /**
@@ -546,17 +551,32 @@ static inline int rcu_read_lock_sched_held(void)
 #define rcu_lockdep_assert(c, s)					\
 	do {								\
 		static bool __section(.data.unlikely) __warned;		\
+		deprecate_rcu_lockdep_assert();				\
 		if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
 			__warned = true;				\
 			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
 		}							\
 	} while (0)
 
+/**
+ * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
+ * @c: condition to check
+ * @s: informative message
+ */
+#define RCU_LOCKDEP_WARN(c, s)						\
+	do {								\
+		static bool __section(.data.unlikely) __warned;		\
+		if (debug_lockdep_rcu_enabled() && !__warned && (c)) {	\
+			__warned = true;				\
+			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
+		}							\
+	} while (0)
+
 #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU)
 static inline void rcu_preempt_sleep_check(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
-			   "Illegal context switch in RCU read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
+			 "Illegal context switch in RCU read-side critical section");
 }
 #else /* #ifdef CONFIG_PROVE_RCU */
 static inline void rcu_preempt_sleep_check(void)
@@ -567,15 +587,16 @@ static inline void rcu_preempt_sleep_check(void)
 #define rcu_sleep_check()						\
 	do {								\
 		rcu_preempt_sleep_check();				\
-		rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),	\
-				   "Illegal context switch in RCU-bh read-side critical section"); \
-		rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),	\
-				   "Illegal context switch in RCU-sched read-side critical section"); \
+		RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),	\
+				 "Illegal context switch in RCU-bh read-side critical section"); \
+		RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),	\
+				 "Illegal context switch in RCU-sched read-side critical section"); \
 	} while (0)
 
 #else /* #ifdef CONFIG_PROVE_RCU */
 
-#define rcu_lockdep_assert(c, s) do { } while (0)
+#define rcu_lockdep_assert(c, s) deprecate_rcu_lockdep_assert()
+#define RCU_LOCKDEP_WARN(c, s) do { } while (0)
 #define rcu_sleep_check() do { } while (0)
 
 #endif /* #else #ifdef CONFIG_PROVE_RCU */
@@ -606,13 +627,13 @@ static inline void rcu_preempt_sleep_check(void)
 ({ \
 	/* Dependency order vs. p above. */ \
 	typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \
-	rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \
+	RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
 	rcu_dereference_sparse(p, space); \
 	((typeof(*p) __force __kernel *)(________p1)); \
 })
 #define __rcu_dereference_protected(p, c, space) \
 ({ \
-	rcu_lockdep_assert(c, "suspicious rcu_dereference_protected() usage"); \
+	RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
 	rcu_dereference_sparse(p, space); \
 	((typeof(*p) __force __kernel *)(p)); \
 })
@@ -836,8 +857,8 @@ static inline void rcu_read_lock(void)
 	__rcu_read_lock();
 	__acquire(RCU);
 	rcu_lock_acquire(&rcu_lock_map);
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_lock() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_lock() used illegally while idle");
 }
 
 /*
@@ -887,8 +908,8 @@ static inline void rcu_read_lock(void)
  */
 static inline void rcu_read_unlock(void)
 {
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_unlock() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_unlock() used illegally while idle");
 	__release(RCU);
 	__rcu_read_unlock();
 	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
@@ -916,8 +937,8 @@ static inline void rcu_read_lock_bh(void)
 	local_bh_disable();
 	__acquire(RCU_BH);
 	rcu_lock_acquire(&rcu_bh_lock_map);
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_lock_bh() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_lock_bh() used illegally while idle");
 }
 
 /*
@@ -927,8 +948,8 @@ static inline void rcu_read_lock_bh(void)
  */
 static inline void rcu_read_unlock_bh(void)
 {
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_unlock_bh() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_unlock_bh() used illegally while idle");
 	rcu_lock_release(&rcu_bh_lock_map);
 	__release(RCU_BH);
 	local_bh_enable();
@@ -952,8 +973,8 @@ static inline void rcu_read_lock_sched(void)
 	preempt_disable();
 	__acquire(RCU_SCHED);
 	rcu_lock_acquire(&rcu_sched_lock_map);
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_lock_sched() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_lock_sched() used illegally while idle");
 }
 
 /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */
@@ -970,8 +991,8 @@ static inline notrace void rcu_read_lock_sched_notrace(void)
  */
 static inline void rcu_read_unlock_sched(void)
 {
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_unlock_sched() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_unlock_sched() used illegally while idle");
 	rcu_lock_release(&rcu_sched_lock_map);
 	__release(RCU_SCHED);
 	preempt_enable();
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..b89f3168411b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock);
 struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 
 #define cgroup_assert_mutex_or_rcu_locked()				\
-	rcu_lockdep_assert(rcu_read_lock_held() ||			\
-			   lockdep_is_held(&cgroup_mutex),		\
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
+			   !lockdep_is_held(&cgroup_mutex),		\
 			   "cgroup_mutex or RCU read lock required");
 
 /*
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fd07d5b7baf..ca368793808e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task);
  */
 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
-	rcu_lockdep_assert(rcu_read_lock_held(),
-			   "find_task_by_pid_ns() needs rcu_read_lock()"
-			   " protection");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+			 "find_task_by_pid_ns() needs rcu_read_lock() protection");
 	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
 
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index de35087c92a5..d3fcb2ec8536 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -415,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
 	struct rcu_head *head = &rcu.head;
 	bool done = false;
 
-	rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
-			   !lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+			 lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
 
 	might_sleep();
 	init_completion(&rcu.completion);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c291bd65d2cb..d0471056d0af 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  */
 void synchronize_sched(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_sched() in RCU read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_sched() in RCU read-side critical section");
 	cond_resched();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cb64d7e13d24..0a73d26357a2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -649,12 +649,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
 	 * It is illegal to enter an extended quiescent state while
 	 * in an RCU read-side critical section.
 	 */
-	rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
-			   "Illegal idle entry in RCU read-side critical section.");
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
-			   "Illegal idle entry in RCU-bh read-side critical section.");
-	rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
-			   "Illegal idle entry in RCU-sched read-side critical section.");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
+			 "Illegal idle entry in RCU read-side critical section.");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
+			 "Illegal idle entry in RCU-bh read-side critical section.");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
+			 "Illegal idle entry in RCU-sched read-side critical section.");
 }
 
 /*
@@ -3161,10 +3161,10 @@ static inline int rcu_blocking_is_gp(void)
  */
 void synchronize_sched(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_sched() in RCU-sched read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_sched() in RCU-sched read-side critical section");
 	if (rcu_blocking_is_gp())
 		return;
 	if (rcu_gp_is_expedited())
@@ -3188,10 +3188,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
  */
 void synchronize_rcu_bh(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
 	if (rcu_blocking_is_gp())
 		return;
 	if (rcu_gp_is_expedited())
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a983bc68a146..9e922f111d63 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -538,10 +538,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
  */
 void synchronize_rcu(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_rcu() in RCU read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_rcu() in RCU read-side critical section");
 	if (!rcu_scheduler_active)
 		return;
 	if (rcu_gp_is_expedited())
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a0a0dd03c73a..47268fb1d27b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -589,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
 void synchronize_rcu_tasks(void)
 {
 	/* Complain if the scheduler has not started.  */
-	rcu_lockdep_assert(!rcu_scheduler_active,
-			   "synchronize_rcu_tasks called too soon");
+	RCU_LOCKDEP_WARN(rcu_scheduler_active,
+			 "synchronize_rcu_tasks called too soon");
 
 	/* Wait for the grace period. */
 	wait_rcu_gp(call_rcu_tasks);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..5e73c79fadd0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2200,8 +2200,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_SMP
 inline struct dl_bw *dl_bw_of(int i)
 {
-	rcu_lockdep_assert(rcu_read_lock_sched_held(),
-			   "sched RCU must be held");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
 	return &cpu_rq(i)->rd->dl_bw;
 }
 
@@ -2210,8 +2210,8 @@ static inline int dl_bw_cpus(int i)
 	struct root_domain *rd = cpu_rq(i)->rd;
 	int cpus = 0;
 
-	rcu_lockdep_assert(rcu_read_lock_sched_held(),
-			   "sched RCU must be held");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
 	for_each_cpu_and(i, rd->span, cpu_active_mask)
 		cpus++;
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4c4f06176f74..cb91c63b4f4a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
 #include <trace/events/workqueue.h>
 
 #define assert_rcu_or_pool_mutex()					\
-	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
-			   lockdep_is_held(&wq_pool_mutex),		\
-			   "sched RCU or wq_pool_mutex should be held")
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
+			 !lockdep_is_held(&wq_pool_mutex),		\
+			 "sched RCU or wq_pool_mutex should be held")
 
 #define assert_rcu_or_wq_mutex(wq)					\
-	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
-			   lockdep_is_held(&wq->mutex),			\
-			   "sched RCU or wq->mutex should be held")
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
+			 !lockdep_is_held(&wq->mutex),			\
+			 "sched RCU or wq->mutex should be held")
 
 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
-	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
-			   lockdep_is_held(&wq->mutex) ||		\
-			   lockdep_is_held(&wq_pool_mutex),		\
-			   "sched RCU, wq->mutex or wq_pool_mutex should be held")
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
+			 !lockdep_is_held(&wq->mutex) &&		\
+			 !lockdep_is_held(&wq_pool_mutex),		\
+			 "sched RCU, wq->mutex or wq_pool_mutex should be held")
 
 #define for_each_cpu_worker_pool(pool, cpu)				\
 	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 188c1d26393b..73455089feef 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -400,9 +400,9 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
 {
 	bool match = false;
 
-	rcu_lockdep_assert(rcu_read_lock_held() ||
-			   lockdep_is_held(&devcgroup_mutex),
-			   "device_cgroup:verify_new_ex called without proper synchronization");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
+			 lockdep_is_held(&devcgroup_mutex),
+			 "device_cgroup:verify_new_ex called without proper synchronization");
 
 	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
 		if (behavior == DEVCG_DEFAULT_ALLOW) {
-- 
cgit v1.2.3-70-g09d2


From 2c534c0da0a68418693e10ce1c4146e085f39518 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Tue, 21 Jul 2015 15:55:09 +0100
Subject: perf/x86/intel/cqm: Return cached counter value from IRQ context

Peter reported the following potential crash which I was able to
reproduce with his test program,

[  148.765788] ------------[ cut here ]------------
[  148.765796] WARNING: CPU: 34 PID: 2840 at kernel/smp.c:417 smp_call_function_many+0xb6/0x260()
[  148.765797] Modules linked in:
[  148.765800] CPU: 34 PID: 2840 Comm: perf Not tainted 4.2.0-rc1+ #4
[  148.765803]  ffffffff81cdc398 ffff88085f105950 ffffffff818bdfd5 0000000000000007
[  148.765805]  0000000000000000 ffff88085f105990 ffffffff810e413a 0000000000000000
[  148.765807]  ffffffff82301080 0000000000000022 ffffffff8107f640 ffffffff8107f640
[  148.765809] Call Trace:
[  148.765810]  <NMI>  [<ffffffff818bdfd5>] dump_stack+0x45/0x57
[  148.765818]  [<ffffffff810e413a>] warn_slowpath_common+0x8a/0xc0
[  148.765822]  [<ffffffff8107f640>] ? intel_cqm_stable+0x60/0x60
[  148.765824]  [<ffffffff8107f640>] ? intel_cqm_stable+0x60/0x60
[  148.765825]  [<ffffffff810e422a>] warn_slowpath_null+0x1a/0x20
[  148.765827]  [<ffffffff811613f6>] smp_call_function_many+0xb6/0x260
[  148.765829]  [<ffffffff8107f640>] ? intel_cqm_stable+0x60/0x60
[  148.765831]  [<ffffffff81161748>] on_each_cpu_mask+0x28/0x60
[  148.765832]  [<ffffffff8107f6ef>] intel_cqm_event_count+0x7f/0xe0
[  148.765836]  [<ffffffff811cdd35>] perf_output_read+0x2a5/0x400
[  148.765839]  [<ffffffff811d2e5a>] perf_output_sample+0x31a/0x590
[  148.765840]  [<ffffffff811d333d>] ? perf_prepare_sample+0x26d/0x380
[  148.765841]  [<ffffffff811d3497>] perf_event_output+0x47/0x60
[  148.765843]  [<ffffffff811d36c5>] __perf_event_overflow+0x215/0x240
[  148.765844]  [<ffffffff811d4124>] perf_event_overflow+0x14/0x20
[  148.765847]  [<ffffffff8107e7f4>] intel_pmu_handle_irq+0x1d4/0x440
[  148.765849]  [<ffffffff811d07a6>] ? __perf_event_task_sched_in+0x36/0xa0
[  148.765853]  [<ffffffff81219bad>] ? vunmap_page_range+0x19d/0x2f0
[  148.765854]  [<ffffffff81219d11>] ? unmap_kernel_range_noflush+0x11/0x20
[  148.765859]  [<ffffffff814ce6fe>] ? ghes_copy_tofrom_phys+0x11e/0x2a0
[  148.765863]  [<ffffffff8109e5db>] ? native_apic_msr_write+0x2b/0x30
[  148.765865]  [<ffffffff8109e44d>] ? x2apic_send_IPI_self+0x1d/0x20
[  148.765869]  [<ffffffff81065135>] ? arch_irq_work_raise+0x35/0x40
[  148.765872]  [<ffffffff811c8d86>] ? irq_work_queue+0x66/0x80
[  148.765875]  [<ffffffff81075306>] perf_event_nmi_handler+0x26/0x40
[  148.765877]  [<ffffffff81063ed9>] nmi_handle+0x79/0x100
[  148.765879]  [<ffffffff81064422>] default_do_nmi+0x42/0x100
[  148.765880]  [<ffffffff81064563>] do_nmi+0x83/0xb0
[  148.765884]  [<ffffffff818c7c0f>] end_repeat_nmi+0x1e/0x2e
[  148.765886]  [<ffffffff811d07a6>] ? __perf_event_task_sched_in+0x36/0xa0
[  148.765888]  [<ffffffff811d07a6>] ? __perf_event_task_sched_in+0x36/0xa0
[  148.765890]  [<ffffffff811d07a6>] ? __perf_event_task_sched_in+0x36/0xa0
[  148.765891]  <<EOE>>  [<ffffffff8110ab66>] finish_task_switch+0x156/0x210
[  148.765898]  [<ffffffff818c1671>] __schedule+0x341/0x920
[  148.765899]  [<ffffffff818c1c87>] schedule+0x37/0x80
[  148.765903]  [<ffffffff810ae1af>] ? do_page_fault+0x2f/0x80
[  148.765905]  [<ffffffff818c1f4a>] schedule_user+0x1a/0x50
[  148.765907]  [<ffffffff818c666c>] retint_careful+0x14/0x32
[  148.765908] ---[ end trace e33ff2be78e14901 ]---

The CQM task events are not safe to be called from within interrupt
context because they require performing an IPI to read the counter value
on all sockets. And performing IPIs from within IRQ context is a
"no-no".

Make do with the last read counter value currently event in
event->count when we're invoked in this context.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikas Shivappa <vikas.shivappa@intel.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Will Auld <will.auld@intel.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/1437490509-15373-1-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 188076161c1b..63eb68b73589 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -951,6 +951,14 @@ static u64 intel_cqm_event_count(struct perf_event *event)
 	if (!cqm_group_leader(event))
 		return 0;
 
+	/*
+	 * Getting up-to-date values requires an SMP IPI which is not
+	 * possible if we're being called in interrupt context. Return
+	 * the cached values instead.
+	 */
+	if (unlikely(in_interrupt()))
+		goto out;
+
 	/*
 	 * Notice that we don't perform the reading of an RMID
 	 * atomically, because we can't hold a spin lock across the
-- 
cgit v1.2.3-70-g09d2


From 031a7f456adecaff692bc040b31a7d2262b4ee56 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 27 Oct 2014 17:16:04 +0100
Subject: apm32: Fix cputime == jiffies assumption

That code wrongly assumes that cputime_t wraps jiffies_t. Lets use
the correct accessors/mutators.

No real harm now as that code can't be used with full dynticks.

Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc; John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/apm_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 927ec9235947..052c9c3026cc 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -919,7 +919,7 @@ recalc:
 	} else if (jiffies_since_last_check > idle_period) {
 		unsigned int idle_percentage;
 
-		idle_percentage = stime - last_stime;
+		idle_percentage = cputime_to_jiffies(stime - last_stime);
 		idle_percentage *= 100;
 		idle_percentage /= jiffies_since_last_check;
 		use_apm_idle = (idle_percentage > idle_threshold);
-- 
cgit v1.2.3-70-g09d2


From b23d8e527815954768861bb20d2b224009fff7cd Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 16 Jul 2015 16:28:44 +0530
Subject: x86/apic: Migrate apic timer to new set_state interface

Migrate apic driver to the new 'set-state' interface provided by
clockevents core, the earlier 'set-mode' interface is marked obsolete
now.

This also enables us to implement callbacks for new states of clockevent
devices, for example: ONESHOT_STOPPED.

We weren't doing anything while switching to resume mode and so that
callback isn't implemented.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linaro-kernel@lists.linaro.org
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Bandan Das <bsd@redhat.com>
Link: http://lkml.kernel.org/r/1896ac5989d27f2ac37f4786af9bd537e1921b83.1437042675.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/apic.c | 86 +++++++++++++++++++++++++++------------------
 1 file changed, 51 insertions(+), 35 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dcb52850a28f..ecd6705c9f4b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -462,40 +462,53 @@ static int lapic_next_deadline(unsigned long delta,
 	return 0;
 }
 
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
-			      struct clock_event_device *evt)
+static int lapic_timer_shutdown(struct clock_event_device *evt)
 {
 	unsigned long flags;
 	unsigned int v;
 
 	/* Lapic used as dummy for broadcast ? */
 	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
-		return;
+		return 0;
 
 	local_irq_save(flags);
 
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-	case CLOCK_EVT_MODE_ONESHOT:
-		__setup_APIC_LVTT(lapic_timer_frequency,
-				  mode != CLOCK_EVT_MODE_PERIODIC, 1);
-		break;
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		v = apic_read(APIC_LVTT);
-		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-		apic_write(APIC_LVTT, v);
-		apic_write(APIC_TMICT, 0);
-		break;
-	case CLOCK_EVT_MODE_RESUME:
-		/* Nothing to do here */
-		break;
-	}
+	v = apic_read(APIC_LVTT);
+	v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+	apic_write(APIC_LVTT, v);
+	apic_write(APIC_TMICT, 0);
+
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+static inline int
+lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
+{
+	unsigned long flags;
+
+	/* Lapic used as dummy for broadcast ? */
+	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+		return 0;
+
+	local_irq_save(flags);
+
+	__setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
 
 	local_irq_restore(flags);
+
+	return 0;
+}
+
+static int lapic_timer_set_periodic(struct clock_event_device *evt)
+{
+	return lapic_timer_set_periodic_oneshot(evt, false);
+}
+
+static int lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+	return lapic_timer_set_periodic_oneshot(evt, true);
 }
 
 /*
@@ -513,15 +526,18 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
  * The local apic timer can be used for any function which is CPU local.
  */
 static struct clock_event_device lapic_clockevent = {
-	.name		= "lapic",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
-			| CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
-	.shift		= 32,
-	.set_mode	= lapic_timer_setup,
-	.set_next_event	= lapic_next_event,
-	.broadcast	= lapic_timer_broadcast,
-	.rating		= 100,
-	.irq		= -1,
+	.name			= "lapic",
+	.features		= CLOCK_EVT_FEAT_PERIODIC |
+				  CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+				  | CLOCK_EVT_FEAT_DUMMY,
+	.shift			= 32,
+	.set_state_shutdown	= lapic_timer_shutdown,
+	.set_state_periodic	= lapic_timer_set_periodic,
+	.set_state_oneshot	= lapic_timer_set_oneshot,
+	.set_next_event		= lapic_next_event,
+	.broadcast		= lapic_timer_broadcast,
+	.rating			= 100,
+	.irq			= -1,
 };
 static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
 
@@ -778,7 +794,7 @@ static int __init calibrate_APIC_clock(void)
 		 * Setup the apic timer manually
 		 */
 		levt->event_handler = lapic_cal_handler;
-		lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+		lapic_timer_set_periodic(levt);
 		lapic_cal_loops = -1;
 
 		/* Let the interrupts run */
@@ -788,7 +804,7 @@ static int __init calibrate_APIC_clock(void)
 			cpu_relax();
 
 		/* Stop the lapic timer */
-		lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+		lapic_timer_shutdown(levt);
 
 		/* Jiffies delta */
 		deltaj = lapic_cal_j2 - lapic_cal_j1;
@@ -878,7 +894,7 @@ static void local_apic_timer_interrupt(void)
 	if (!evt->event_handler) {
 		pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
 		/* Switch it off */
-		lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+		lapic_timer_shutdown(evt);
 		return;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From c948c26048ecb1023d2e68222c736f7da41da498 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 30 Jul 2015 00:30:51 +0200
Subject: x86/apic: Drop local_irq_save/restore in timer callbacks

These callbacks are called with interrupts disabled from the core
code. Fixup the local caller to disable interrupts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
---
 arch/x86/kernel/apic/apic.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ecd6705c9f4b..1dceb2732425 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -464,40 +464,27 @@ static int lapic_next_deadline(unsigned long delta,
 
 static int lapic_timer_shutdown(struct clock_event_device *evt)
 {
-	unsigned long flags;
 	unsigned int v;
 
 	/* Lapic used as dummy for broadcast ? */
 	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
 		return 0;
 
-	local_irq_save(flags);
-
 	v = apic_read(APIC_LVTT);
 	v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
 	apic_write(APIC_LVTT, v);
 	apic_write(APIC_TMICT, 0);
-
-	local_irq_restore(flags);
-
 	return 0;
 }
 
 static inline int
 lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
 {
-	unsigned long flags;
-
 	/* Lapic used as dummy for broadcast ? */
 	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
 		return 0;
 
-	local_irq_save(flags);
-
 	__setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
-
-	local_irq_restore(flags);
-
 	return 0;
 }
 
@@ -804,6 +791,7 @@ static int __init calibrate_APIC_clock(void)
 			cpu_relax();
 
 		/* Stop the lapic timer */
+		local_irq_disable();
 		lapic_timer_shutdown(levt);
 
 		/* Jiffies delta */
@@ -815,8 +803,8 @@ static int __init calibrate_APIC_clock(void)
 			apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
 		else
 			levt->features |= CLOCK_EVT_FEAT_DUMMY;
-	} else
-		local_irq_enable();
+	}
+	local_irq_enable();
 
 	if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
 		pr_warning("APIC timer disabled due to verification failure\n");
-- 
cgit v1.2.3-70-g09d2


From 646c4b75494747887f936513b669bb8a2d794459 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 30 Jul 2015 15:51:32 +0800
Subject: x86/irq: Use the caller provided polarity setting in
 mp_check_pin_attr()

Commit d32932d02e18 ("x86/irq: Convert IOAPIC to use hierarchical
irqdomain interfaces") introduced a regression which causes
malfunction of interrupt lines.

The reason is that the conversion of mp_check_pin_attr() missed to
update the polarity selection of the interrupt pin with the caller
provided setting and instead uses a stale attribute value. That in
turn results in chosing the wrong interrupt flow handler.

Use the caller supplied setting to configure the pin correctly which
also choses the correct interrupt flow handler.

This restores the original behaviour and on the affected
machine/driver (Surface Pro 3, i2c controller) all IOAPIC IRQ
configuration are identical to v4.1.

Fixes: d32932d02e18 ("x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces")
Reported-and-tested-by: Matt Fleming <matt@codeblueprint.co.uk>
Reported-and-tested-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Chen Yu <yu.c.chen@intel.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1438242695-23531-1-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/io_apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 845dc0df2002..206052e55517 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -943,7 +943,7 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
 	 */
 	if (irq < nr_legacy_irqs() && data->count == 1) {
 		if (info->ioapic_trigger != data->trigger)
-			mp_register_handler(irq, data->trigger);
+			mp_register_handler(irq, info->ioapic_trigger);
 		data->entry.trigger = data->trigger = info->ioapic_trigger;
 		data->entry.polarity = data->polarity = info->ioapic_polarity;
 	}
-- 
cgit v1.2.3-70-g09d2


From c8b5db7de66b75330a96f9f1ad7376b89646c953 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 16 Jul 2015 16:28:45 +0530
Subject: x86/hpet: Migrate to new set_state interface

Migrate hpet driver to the new 'set-state' interface provided by
clockevents core, the earlier 'set-mode' interface is marked obsolete
now.

This also enables us to implement callbacks for new states of clockevent
devices, for example: ONESHOT_STOPPED.

Forward definition of 'hpet_clockevent' wasn't required and so it is
placed after all the callback are defined, to avoid forward declaring
all the callbacks.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linaro-kernel@lists.linaro.org
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Link: http://lkml.kernel.org/r/8cc9864b6d6342dfac28f270cf69f4cba46fffae.1437042675.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/hpet.c | 198 +++++++++++++++++++++++++++++++------------------
 1 file changed, 124 insertions(+), 74 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 10757d0a3fcf..437297ac96d0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -226,22 +226,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
  */
 static unsigned long hpet_freq;
 
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt);
-static int hpet_legacy_next_event(unsigned long delta,
-			   struct clock_event_device *evt);
-
-/*
- * The hpet clock event device
- */
-static struct clock_event_device hpet_clockevent = {
-	.name		= "hpet",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= hpet_legacy_set_mode,
-	.set_next_event = hpet_legacy_next_event,
-	.irq		= 0,
-	.rating		= 50,
-};
+static struct clock_event_device hpet_clockevent;
 
 static void hpet_stop_counter(void)
 {
@@ -306,64 +291,74 @@ static void hpet_legacy_clockevent_register(void)
 	printk(KERN_DEBUG "hpet clockevent registered\n");
 }
 
-static void hpet_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt, int timer)
+static int hpet_set_periodic(struct clock_event_device *evt, int timer)
 {
 	unsigned int cfg, cmp, now;
 	uint64_t delta;
 
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		hpet_stop_counter();
-		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
-		delta >>= evt->shift;
-		now = hpet_readl(HPET_COUNTER);
-		cmp = now + (unsigned int) delta;
-		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
-		       HPET_TN_SETVAL | HPET_TN_32BIT;
-		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		hpet_writel(cmp, HPET_Tn_CMP(timer));
-		udelay(1);
-		/*
-		 * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
-		 * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
-		 * bit is automatically cleared after the first write.
-		 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
-		 * Publication # 24674)
-		 */
-		hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
-		hpet_start_counter();
-		hpet_print_config();
-		break;
+	hpet_stop_counter();
+	delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult;
+	delta >>= evt->shift;
+	now = hpet_readl(HPET_COUNTER);
+	cmp = now + (unsigned int)delta;
+	cfg = hpet_readl(HPET_Tn_CFG(timer));
+	cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+	       HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+	hpet_writel(cmp, HPET_Tn_CMP(timer));
+	udelay(1);
+	/*
+	 * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+	 * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+	 * bit is automatically cleared after the first write.
+	 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+	 * Publication # 24674)
+	 */
+	hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer));
+	hpet_start_counter();
+	hpet_print_config();
 
-	case CLOCK_EVT_MODE_ONESHOT:
-		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		cfg &= ~HPET_TN_PERIODIC;
-		cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
-		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		break;
+	return 0;
+}
 
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		cfg &= ~HPET_TN_ENABLE;
-		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		break;
+static int hpet_set_oneshot(struct clock_event_device *evt, int timer)
+{
+	unsigned int cfg;
 
-	case CLOCK_EVT_MODE_RESUME:
-		if (timer == 0) {
-			hpet_enable_legacy_int();
-		} else {
-			struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-			irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
-			disable_irq(hdev->irq);
-			irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
-			enable_irq(hdev->irq);
-		}
-		hpet_print_config();
-		break;
+	cfg = hpet_readl(HPET_Tn_CFG(timer));
+	cfg &= ~HPET_TN_PERIODIC;
+	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+	return 0;
+}
+
+static int hpet_shutdown(struct clock_event_device *evt, int timer)
+{
+	unsigned int cfg;
+
+	cfg = hpet_readl(HPET_Tn_CFG(timer));
+	cfg &= ~HPET_TN_ENABLE;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+	return 0;
+}
+
+static int hpet_resume(struct clock_event_device *evt, int timer)
+{
+	if (!timer) {
+		hpet_enable_legacy_int();
+	} else {
+		struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+		irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
+		disable_irq(hdev->irq);
+		irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
+		enable_irq(hdev->irq);
 	}
+	hpet_print_config();
+
+	return 0;
 }
 
 static int hpet_next_event(unsigned long delta,
@@ -403,10 +398,24 @@ static int hpet_next_event(unsigned long delta,
 	return res < HPET_MIN_CYCLES ? -ETIME : 0;
 }
 
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
-			struct clock_event_device *evt)
+static int hpet_legacy_shutdown(struct clock_event_device *evt)
+{
+	return hpet_shutdown(evt, 0);
+}
+
+static int hpet_legacy_set_oneshot(struct clock_event_device *evt)
+{
+	return hpet_set_oneshot(evt, 0);
+}
+
+static int hpet_legacy_set_periodic(struct clock_event_device *evt)
 {
-	hpet_set_mode(mode, evt, 0);
+	return hpet_set_periodic(evt, 0);
+}
+
+static int hpet_legacy_resume(struct clock_event_device *evt)
+{
+	return hpet_resume(evt, 0);
 }
 
 static int hpet_legacy_next_event(unsigned long delta,
@@ -415,6 +424,22 @@ static int hpet_legacy_next_event(unsigned long delta,
 	return hpet_next_event(delta, evt, 0);
 }
 
+/*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+	.name			= "hpet",
+	.features		= CLOCK_EVT_FEAT_PERIODIC |
+				  CLOCK_EVT_FEAT_ONESHOT,
+	.set_state_periodic	= hpet_legacy_set_periodic,
+	.set_state_oneshot	= hpet_legacy_set_oneshot,
+	.set_state_shutdown	= hpet_legacy_shutdown,
+	.tick_resume		= hpet_legacy_resume,
+	.set_next_event		= hpet_legacy_next_event,
+	.irq			= 0,
+	.rating			= 50,
+};
+
 /*
  * HPET MSI Support
  */
@@ -459,11 +484,32 @@ void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
 	msg->address_hi = 0;
 }
 
-static void hpet_msi_set_mode(enum clock_event_mode mode,
-				struct clock_event_device *evt)
+static int hpet_msi_shutdown(struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+	return hpet_shutdown(evt, hdev->num);
+}
+
+static int hpet_msi_set_oneshot(struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+	return hpet_set_oneshot(evt, hdev->num);
+}
+
+static int hpet_msi_set_periodic(struct clock_event_device *evt)
 {
 	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-	hpet_set_mode(mode, evt, hdev->num);
+
+	return hpet_set_periodic(evt, hdev->num);
+}
+
+static int hpet_msi_resume(struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+	return hpet_resume(evt, hdev->num);
 }
 
 static int hpet_msi_next_event(unsigned long delta,
@@ -523,10 +569,14 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 
 	evt->rating = 110;
 	evt->features = CLOCK_EVT_FEAT_ONESHOT;
-	if (hdev->flags & HPET_DEV_PERI_CAP)
+	if (hdev->flags & HPET_DEV_PERI_CAP) {
 		evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+		evt->set_state_periodic = hpet_msi_set_periodic;
+	}
 
-	evt->set_mode = hpet_msi_set_mode;
+	evt->set_state_shutdown = hpet_msi_shutdown;
+	evt->set_state_oneshot = hpet_msi_set_oneshot;
+	evt->tick_resume = hpet_msi_resume;
 	evt->set_next_event = hpet_msi_next_event;
 	evt->cpumask = cpumask_of(hdev->cpu);
 
-- 
cgit v1.2.3-70-g09d2


From 37868fe113ff2ba814b3b4eb12df214df555f8dc Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 30 Jul 2015 14:31:32 -0700
Subject: x86/ldt: Make modify_ldt synchronous

modify_ldt() has questionable locking and does not synchronize
threads.  Improve it: redesign the locking and synchronize all
threads' LDTs using an IPI on all modifications.

This will dramatically slow down modify_ldt in multithreaded
programs, but there shouldn't be any multithreaded programs that
care about modify_ldt's performance in the first place.

This fixes some fallout from the CVE-2015-5157 fixes.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: security@kernel.org <security@kernel.org>
Cc: <stable@vger.kernel.org>
Cc: xen-devel <xen-devel@lists.xen.org>
Link: http://lkml.kernel.org/r/4c6978476782160600471bd865b318db34c7b628.1438291540.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/desc.h        |  15 ---
 arch/x86/include/asm/mmu.h         |   3 +-
 arch/x86/include/asm/mmu_context.h |  54 +++++++-
 arch/x86/kernel/cpu/common.c       |   4 +-
 arch/x86/kernel/cpu/perf_event.c   |  12 +-
 arch/x86/kernel/ldt.c              | 262 ++++++++++++++++++++-----------------
 arch/x86/kernel/process_64.c       |   4 +-
 arch/x86/kernel/step.c             |   6 +-
 arch/x86/power/cpu.c               |   3 +-
 9 files changed, 210 insertions(+), 153 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a0bf89fd2647..4e10d73cf018 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -280,21 +280,6 @@ static inline void clear_LDT(void)
 	set_ldt(NULL, 0);
 }
 
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
-	set_ldt(pc->ldt, pc->size);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
-	preempt_disable();
-	load_LDT_nolock(pc);
-	preempt_enable();
-}
-
 static inline unsigned long get_desc_base(const struct desc_struct *desc)
 {
 	return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 09b9620a73b4..364d27481a52 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,8 +9,7 @@
  * we put the segment information here.
  */
 typedef struct {
-	void *ldt;
-	int size;
+	struct ldt_struct *ldt;
 
 #ifdef CONFIG_X86_64
 	/* True if mm supports a task running in 32 bit compatibility mode. */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 804a3a6030ca..984abfe47edc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -33,6 +33,50 @@ static inline void load_mm_cr4(struct mm_struct *mm)
 static inline void load_mm_cr4(struct mm_struct *mm) {}
 #endif
 
+/*
+ * ldt_structs can be allocated, used, and freed, but they are never
+ * modified while live.
+ */
+struct ldt_struct {
+	/*
+	 * Xen requires page-aligned LDTs with special permissions.  This is
+	 * needed to prevent us from installing evil descriptors such as
+	 * call gates.  On native, we could merge the ldt_struct and LDT
+	 * allocations, but it's not worth trying to optimize.
+	 */
+	struct desc_struct *entries;
+	int size;
+};
+
+static inline void load_mm_ldt(struct mm_struct *mm)
+{
+	struct ldt_struct *ldt;
+
+	/* lockless_dereference synchronizes with smp_store_release */
+	ldt = lockless_dereference(mm->context.ldt);
+
+	/*
+	 * Any change to mm->context.ldt is followed by an IPI to all
+	 * CPUs with the mm active.  The LDT will not be freed until
+	 * after the IPI is handled by all such CPUs.  This means that,
+	 * if the ldt_struct changes before we return, the values we see
+	 * will be safe, and the new values will be loaded before we run
+	 * any user code.
+	 *
+	 * NB: don't try to convert this to use RCU without extreme care.
+	 * We would still need IRQs off, because we don't want to change
+	 * the local LDT after an IPI loaded a newer value than the one
+	 * that we can see.
+	 */
+
+	if (unlikely(ldt))
+		set_ldt(ldt->entries, ldt->size);
+	else
+		clear_LDT();
+
+	DEBUG_LOCKS_WARN_ON(preemptible());
+}
+
 /*
  * Used for LDT copy/destruction.
  */
@@ -78,12 +122,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		 * was called and then modify_ldt changed
 		 * prev->context.ldt but suppressed an IPI to this CPU.
 		 * In this case, prev->context.ldt != NULL, because we
-		 * never free an LDT while the mm still exists.  That
-		 * means that next->context.ldt != prev->context.ldt,
-		 * because mms never share an LDT.
+		 * never set context.ldt to NULL while the mm still
+		 * exists.  That means that next->context.ldt !=
+		 * prev->context.ldt, because mms never share an LDT.
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt))
-			load_LDT_nolock(&next->context);
+			load_mm_ldt(next);
 	}
 #ifdef CONFIG_SMP
 	  else {
@@ -106,7 +150,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			load_cr3(next->pgd);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 			load_mm_cr4(next);
-			load_LDT_nolock(&next->context);
+			load_mm_ldt(next);
 		}
 	}
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 922c5e0cea4c..cb9e5df42dd2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1410,7 +1410,7 @@ void cpu_init(void)
 	load_sp0(t, &current->thread);
 	set_tss_desc(cpu, t);
 	load_TR_desc();
-	load_LDT(&init_mm.context);
+	load_mm_ldt(&init_mm);
 
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
@@ -1459,7 +1459,7 @@ void cpu_init(void)
 	load_sp0(t, thread);
 	set_tss_desc(cpu, t);
 	load_TR_desc();
-	load_LDT(&init_mm.context);
+	load_mm_ldt(&init_mm);
 
 	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3658de47900f..9469dfa55607 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2179,21 +2179,25 @@ static unsigned long get_segment_base(unsigned int segment)
 	int idx = segment >> 3;
 
 	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+		struct ldt_struct *ldt;
+
 		if (idx > LDT_ENTRIES)
 			return 0;
 
-		if (idx > current->active_mm->context.size)
+		/* IRQs are off, so this synchronizes with smp_store_release */
+		ldt = lockless_dereference(current->active_mm->context.ldt);
+		if (!ldt || idx > ldt->size)
 			return 0;
 
-		desc = current->active_mm->context.ldt;
+		desc = &ldt->entries[idx];
 	} else {
 		if (idx > GDT_ENTRIES)
 			return 0;
 
-		desc = raw_cpu_ptr(gdt_page.gdt);
+		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
 	}
 
-	return get_desc_base(desc + idx);
+	return get_desc_base(desc);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index c37886d759cc..2bcc0525f1c1 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
 
@@ -20,82 +21,82 @@
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
 
-#ifdef CONFIG_SMP
+/* context.lock is held for us, so we don't need any locking. */
 static void flush_ldt(void *current_mm)
 {
-	if (current->active_mm == current_mm)
-		load_LDT(&current->active_mm->context);
+	mm_context_t *pc;
+
+	if (current->active_mm != current_mm)
+		return;
+
+	pc = &current->active_mm->context;
+	set_ldt(pc->ldt->entries, pc->ldt->size);
 }
-#endif
 
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+static struct ldt_struct *alloc_ldt_struct(int size)
 {
-	void *oldldt, *newldt;
-	int oldsize;
-
-	if (mincount <= pc->size)
-		return 0;
-	oldsize = pc->size;
-	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
-			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
-	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
-		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+	struct ldt_struct *new_ldt;
+	int alloc_size;
+
+	if (size > LDT_ENTRIES)
+		return NULL;
+
+	new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+	if (!new_ldt)
+		return NULL;
+
+	BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+	alloc_size = size * LDT_ENTRY_SIZE;
+
+	/*
+	 * Xen is very picky: it requires a page-aligned LDT that has no
+	 * trailing nonzero bytes in any page that contains LDT descriptors.
+	 * Keep it simple: zero the whole allocation and never allocate less
+	 * than PAGE_SIZE.
+	 */
+	if (alloc_size > PAGE_SIZE)
+		new_ldt->entries = vzalloc(alloc_size);
 	else
-		newldt = (void *)__get_free_page(GFP_KERNEL);
-
-	if (!newldt)
-		return -ENOMEM;
+		new_ldt->entries = kzalloc(PAGE_SIZE, GFP_KERNEL);
 
-	if (oldsize)
-		memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
-	oldldt = pc->ldt;
-	memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
-	       (mincount - oldsize) * LDT_ENTRY_SIZE);
+	if (!new_ldt->entries) {
+		kfree(new_ldt);
+		return NULL;
+	}
 
-	paravirt_alloc_ldt(newldt, mincount);
+	new_ldt->size = size;
+	return new_ldt;
+}
 
-#ifdef CONFIG_X86_64
-	/* CHECKME: Do we really need this ? */
-	wmb();
-#endif
-	pc->ldt = newldt;
-	wmb();
-	pc->size = mincount;
-	wmb();
-
-	if (reload) {
-#ifdef CONFIG_SMP
-		preempt_disable();
-		load_LDT(pc);
-		if (!cpumask_equal(mm_cpumask(current->mm),
-				   cpumask_of(smp_processor_id())))
-			smp_call_function(flush_ldt, current->mm, 1);
-		preempt_enable();
-#else
-		load_LDT(pc);
-#endif
-	}
-	if (oldsize) {
-		paravirt_free_ldt(oldldt, oldsize);
-		if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
-			vfree(oldldt);
-		else
-			put_page(virt_to_page(oldldt));
-	}
-	return 0;
+/* After calling this, the LDT is immutable. */
+static void finalize_ldt_struct(struct ldt_struct *ldt)
+{
+	paravirt_alloc_ldt(ldt->entries, ldt->size);
 }
 
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+/* context.lock is held */
+static void install_ldt(struct mm_struct *current_mm,
+			struct ldt_struct *ldt)
 {
-	int err = alloc_ldt(new, old->size, 0);
-	int i;
+	/* Synchronizes with lockless_dereference in load_mm_ldt. */
+	smp_store_release(&current_mm->context.ldt, ldt);
+
+	/* Activate the LDT for all CPUs using current_mm. */
+	on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+}
 
-	if (err < 0)
-		return err;
+static void free_ldt_struct(struct ldt_struct *ldt)
+{
+	if (likely(!ldt))
+		return;
 
-	for (i = 0; i < old->size; i++)
-		write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
-	return 0;
+	paravirt_free_ldt(ldt->entries, ldt->size);
+	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+		vfree(ldt->entries);
+	else
+		kfree(ldt->entries);
+	kfree(ldt);
 }
 
 /*
@@ -104,17 +105,37 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
  */
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
+	struct ldt_struct *new_ldt;
 	struct mm_struct *old_mm;
 	int retval = 0;
 
 	mutex_init(&mm->context.lock);
-	mm->context.size = 0;
 	old_mm = current->mm;
-	if (old_mm && old_mm->context.size > 0) {
-		mutex_lock(&old_mm->context.lock);
-		retval = copy_ldt(&mm->context, &old_mm->context);
-		mutex_unlock(&old_mm->context.lock);
+	if (!old_mm) {
+		mm->context.ldt = NULL;
+		return 0;
 	}
+
+	mutex_lock(&old_mm->context.lock);
+	if (!old_mm->context.ldt) {
+		mm->context.ldt = NULL;
+		goto out_unlock;
+	}
+
+	new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+	if (!new_ldt) {
+		retval = -ENOMEM;
+		goto out_unlock;
+	}
+
+	memcpy(new_ldt->entries, old_mm->context.ldt->entries,
+	       new_ldt->size * LDT_ENTRY_SIZE);
+	finalize_ldt_struct(new_ldt);
+
+	mm->context.ldt = new_ldt;
+
+out_unlock:
+	mutex_unlock(&old_mm->context.lock);
 	return retval;
 }
 
@@ -125,53 +146,47 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
  */
 void destroy_context(struct mm_struct *mm)
 {
-	if (mm->context.size) {
-#ifdef CONFIG_X86_32
-		/* CHECKME: Can this ever happen ? */
-		if (mm == current->active_mm)
-			clear_LDT();
-#endif
-		paravirt_free_ldt(mm->context.ldt, mm->context.size);
-		if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
-			vfree(mm->context.ldt);
-		else
-			put_page(virt_to_page(mm->context.ldt));
-		mm->context.size = 0;
-	}
+	free_ldt_struct(mm->context.ldt);
+	mm->context.ldt = NULL;
 }
 
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
-	int err;
+	int retval;
 	unsigned long size;
 	struct mm_struct *mm = current->mm;
 
-	if (!mm->context.size)
-		return 0;
+	mutex_lock(&mm->context.lock);
+
+	if (!mm->context.ldt) {
+		retval = 0;
+		goto out_unlock;
+	}
+
 	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
 		bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
 
-	mutex_lock(&mm->context.lock);
-	size = mm->context.size * LDT_ENTRY_SIZE;
+	size = mm->context.ldt->size * LDT_ENTRY_SIZE;
 	if (size > bytecount)
 		size = bytecount;
 
-	err = 0;
-	if (copy_to_user(ptr, mm->context.ldt, size))
-		err = -EFAULT;
-	mutex_unlock(&mm->context.lock);
-	if (err < 0)
-		goto error_return;
+	if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+		retval = -EFAULT;
+		goto out_unlock;
+	}
+
 	if (size != bytecount) {
-		/* zero-fill the rest */
-		if (clear_user(ptr + size, bytecount - size) != 0) {
-			err = -EFAULT;
-			goto error_return;
+		/* Zero-fill the rest and pretend we read bytecount bytes. */
+		if (clear_user(ptr + size, bytecount - size)) {
+			retval = -EFAULT;
+			goto out_unlock;
 		}
 	}
-	return bytecount;
-error_return:
-	return err;
+	retval = bytecount;
+
+out_unlock:
+	mutex_unlock(&mm->context.lock);
+	return retval;
 }
 
 static int read_default_ldt(void __user *ptr, unsigned long bytecount)
@@ -195,6 +210,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	struct desc_struct ldt;
 	int error;
 	struct user_desc ldt_info;
+	int oldsize, newsize;
+	struct ldt_struct *new_ldt, *old_ldt;
 
 	error = -EINVAL;
 	if (bytecount != sizeof(ldt_info))
@@ -213,34 +230,39 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 			goto out;
 	}
 
-	mutex_lock(&mm->context.lock);
-	if (ldt_info.entry_number >= mm->context.size) {
-		error = alloc_ldt(&current->mm->context,
-				  ldt_info.entry_number + 1, 1);
-		if (error < 0)
-			goto out_unlock;
-	}
-
-	/* Allow LDTs to be cleared by the user. */
-	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
-		if (oldmode || LDT_empty(&ldt_info)) {
-			memset(&ldt, 0, sizeof(ldt));
-			goto install;
+	if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
+	    LDT_empty(&ldt_info)) {
+		/* The user wants to clear the entry. */
+		memset(&ldt, 0, sizeof(ldt));
+	} else {
+		if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+			error = -EINVAL;
+			goto out;
 		}
+
+		fill_ldt(&ldt, &ldt_info);
+		if (oldmode)
+			ldt.avl = 0;
 	}
 
-	if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
-		error = -EINVAL;
+	mutex_lock(&mm->context.lock);
+
+	old_ldt = mm->context.ldt;
+	oldsize = old_ldt ? old_ldt->size : 0;
+	newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+
+	error = -ENOMEM;
+	new_ldt = alloc_ldt_struct(newsize);
+	if (!new_ldt)
 		goto out_unlock;
-	}
 
-	fill_ldt(&ldt, &ldt_info);
-	if (oldmode)
-		ldt.avl = 0;
+	if (old_ldt)
+		memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+	new_ldt->entries[ldt_info.entry_number] = ldt;
+	finalize_ldt_struct(new_ldt);
 
-	/* Install the new entry ...  */
-install:
-	write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+	install_ldt(mm, new_ldt);
+	free_ldt_struct(old_ldt);
 	error = 0;
 
 out_unlock:
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 71d7849a07f7..f6b916387590 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,11 +121,11 @@ void __show_regs(struct pt_regs *regs, int all)
 void release_thread(struct task_struct *dead_task)
 {
 	if (dead_task->mm) {
-		if (dead_task->mm->context.size) {
+		if (dead_task->mm->context.ldt) {
 			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 				dead_task->comm,
 				dead_task->mm->context.ldt,
-				dead_task->mm->context.size);
+				dead_task->mm->context.ldt->size);
 			BUG();
 		}
 	}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 9b4d51d0c0d0..6273324186ac 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 #include <linux/ptrace.h>
 #include <asm/desc.h>
+#include <asm/mmu_context.h>
 
 unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
 {
@@ -30,10 +31,11 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		seg &= ~7UL;
 
 		mutex_lock(&child->mm->context.lock);
-		if (unlikely((seg >> 3) >= child->mm->context.size))
+		if (unlikely(!child->mm->context.ldt ||
+			     (seg >> 3) >= child->mm->context.ldt->size))
 			addr = -1L; /* bogus selector, access would fault */
 		else {
-			desc = child->mm->context.ldt + seg;
+			desc = &child->mm->context.ldt->entries[seg];
 			base = get_desc_base(desc);
 
 			/* 16-bit code segment? */
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 0d7dd1f5ac36..9ab52791fed5 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -22,6 +22,7 @@
 #include <asm/fpu/internal.h>
 #include <asm/debugreg.h>
 #include <asm/cpu.h>
+#include <asm/mmu_context.h>
 
 #ifdef CONFIG_X86_32
 __visible unsigned long saved_context_ebx;
@@ -153,7 +154,7 @@ static void fix_processor_context(void)
 	syscall_init();				/* This sets MSR_*STAR and related */
 #endif
 	load_TR_desc();				/* This does ltr */
-	load_LDT(&current->active_mm->context);	/* This does lldt */
+	load_mm_ldt(current->active_mm);	/* This does lldt */
 
 	fpu__resume_cpu();
 }
-- 
cgit v1.2.3-70-g09d2


From 7b868e4802a86d867aad1be0471b5767d9c20e10 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 15:40:18 +0200
Subject: uprobes/x86: Reimplement arch_uretprobe_is_alive()

Add the x86 specific version of arch_uretprobe_is_alive()
helper. It returns true if the stack frame mangled by
prepare_uretprobe() is still on stack. So if it returns false,
we know that the probed function has already returned.

We add the new return_instance->stack member and change the
generic code to initialize it in prepare_uretprobe, but it
should be equally useful for other architectures.

TODO: this assumes that the probed application can't use
      multiple stacks (say sigaltstack). We will try to improve
      this logic later.

Tested-by: Pratyush Anand <panand@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Anton Arapov <arapov@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150721134018.GA4766@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/uprobes.c | 5 +++++
 include/linux/uprobes.h   | 1 +
 kernel/events/uprobes.c   | 1 +
 3 files changed, 7 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 66476244731e..58e9b842633f 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -985,3 +985,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 
 	return -1;
 }
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs)
+{
+	return regs->sp <= ret->stack;
+}
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 50d2764d66a8..7ab6d2c8be49 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -95,6 +95,7 @@ struct uprobe_task {
 struct return_instance {
 	struct uprobe		*uprobe;
 	unsigned long		func;
+	unsigned long		stack;		/* stack pointer */
 	unsigned long		orig_ret_vaddr; /* original return address */
 	bool			chained;	/* true, if instance is nested */
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1c71b6242a7e..c5f316e06dc0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1562,6 +1562,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 
 	ri->uprobe = get_uprobe(uprobe);
 	ri->func = instruction_pointer(regs);
+	ri->stack = user_stack_pointer(regs);
 	ri->orig_ret_vaddr = orig_ret_vaddr;
 	ri->chained = chained;
 
-- 
cgit v1.2.3-70-g09d2


From 86dcb702e74b8ab7d3b2d36984ef00671cea73b9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 15:40:26 +0200
Subject: uprobes: Add the "enum rp_check ctx" arg to arch_uretprobe_is_alive()

arch/x86 doesn't care (so far), but as Pratyush Anand pointed
out other architectures might want why arch_uretprobe_is_alive()
was called and use different checks depending on the context.
Add the new argument to distinguish 2 callers.

Tested-by: Pratyush Anand <panand@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Anton Arapov <arapov@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150721134026.GA4779@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/uprobes.c | 3 ++-
 include/linux/uprobes.h   | 7 ++++++-
 kernel/events/uprobes.c   | 9 ++++++---
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 58e9b842633f..acf8b9010bbf 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -986,7 +986,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 	return -1;
 }
 
-bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs)
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+				struct pt_regs *regs)
 {
 	return regs->sp <= ret->stack;
 }
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 7ab6d2c8be49..c0a540239ab6 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -102,6 +102,11 @@ struct return_instance {
 	struct return_instance	*next;		/* keep as stack */
 };
 
+enum rp_check {
+	RP_CHECK_CALL,
+	RP_CHECK_RET,
+};
+
 struct xol_area;
 
 struct uprobes_state {
@@ -138,7 +143,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
 extern int  arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data);
 extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs);
-extern bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs);
+extern bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs);
 extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
 extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 					 void *src, unsigned long len);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7e61c8ca27e0..df5661a44e35 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1514,7 +1514,9 @@ static unsigned long get_trampoline_vaddr(void)
 static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs)
 {
 	struct return_instance *ri = utask->return_instances;
-	while (ri && !arch_uretprobe_is_alive(ri, regs)) {
+	enum rp_check ctx = RP_CHECK_CALL;
+
+	while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
 		ri = free_ret_instance(ri);
 		utask->depth--;
 	}
@@ -1805,7 +1807,7 @@ static void handle_trampoline(struct pt_regs *regs)
 		 * could hit this trampoline on return. TODO: sigaltstack().
 		 */
 		next = find_next_ret_chain(ri);
-		valid = !next || arch_uretprobe_is_alive(next, regs);
+		valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
 
 		instruction_pointer_set(regs, ri->orig_ret_vaddr);
 		do {
@@ -1830,7 +1832,8 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
 	return false;
 }
 
-bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs)
+bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+					struct pt_regs *regs)
 {
 	return true;
 }
-- 
cgit v1.2.3-70-g09d2


From db087ef69a2b155ae001665bf0b3806abde7ee34 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 15:40:28 +0200
Subject: uprobes/x86: Make arch_uretprobe_is_alive(RP_CHECK_CALL) more clever

The previous change documents that cleanup_return_instances()
can't always detect the dead frames, the stack can grow. But
there is one special case which imho worth fixing:
arch_uretprobe_is_alive() can return true when the stack didn't
actually grow, but the next "call" insn uses the already
invalidated frame.

Test-case:

	#include <stdio.h>
	#include <setjmp.h>

	jmp_buf jmp;
	int nr = 1024;

	void func_2(void)
	{
		if (--nr == 0)
			return;
		longjmp(jmp, 1);
	}

	void func_1(void)
	{
		setjmp(jmp);
		func_2();
	}

	int main(void)
	{
		func_1();
		return 0;
	}

If you ret-probe func_1() and func_2() prepare_uretprobe() hits
the MAX_URETPROBE_DEPTH limit and "return" from func_2() is not
reported.

When we know that the new call is not chained, we can do the
more strict check. In this case "sp" points to the new ret-addr,
so every frame which uses the same "sp" must be dead. The only
complication is that arch_uretprobe_is_alive() needs to know was
it chained or not, so we add the new RP_CHECK_CHAIN_CALL enum
and change prepare_uretprobe() to pass RP_CHECK_CALL only if
!chained.

Note: arch_uretprobe_is_alive() could also re-read *sp and check
if this word is still trampoline_vaddr. This could obviously
improve the logic, but I would like to avoid another
copy_from_user() especially in the case when we can't avoid the
false "alive == T" positives.

Tested-by: Pratyush Anand <panand@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Anton Arapov <arapov@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150721134028.GA4786@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/uprobes.c |  5 ++++-
 include/linux/uprobes.h   |  1 +
 kernel/events/uprobes.c   | 14 +++++++-------
 3 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index acf8b9010bbf..bf4db6eaec8f 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -989,5 +989,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
 				struct pt_regs *regs)
 {
-	return regs->sp <= ret->stack;
+	if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */
+		return regs->sp < ret->stack;
+	else
+		return regs->sp <= ret->stack;
 }
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index c0a540239ab6..0bdc72f36905 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -104,6 +104,7 @@ struct return_instance {
 
 enum rp_check {
 	RP_CHECK_CALL,
+	RP_CHECK_CHAIN_CALL,
 	RP_CHECK_RET,
 };
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index df5661a44e35..0f370ef57a02 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1511,10 +1511,11 @@ static unsigned long get_trampoline_vaddr(void)
 	return trampoline_vaddr;
 }
 
-static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs)
+static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
+					struct pt_regs *regs)
 {
 	struct return_instance *ri = utask->return_instances;
-	enum rp_check ctx = RP_CHECK_CALL;
+	enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
 
 	while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
 		ri = free_ret_instance(ri);
@@ -1528,7 +1529,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 	struct return_instance *ri;
 	struct uprobe_task *utask;
 	unsigned long orig_ret_vaddr, trampoline_vaddr;
-	bool chained = false;
+	bool chained;
 
 	if (!get_xol_area())
 		return;
@@ -1554,14 +1555,15 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 		goto fail;
 
 	/* drop the entries invalidated by longjmp() */
-	cleanup_return_instances(utask, regs);
+	chained = (orig_ret_vaddr == trampoline_vaddr);
+	cleanup_return_instances(utask, chained, regs);
 
 	/*
 	 * We don't want to keep trampoline address in stack, rather keep the
 	 * original return address of first caller thru all the consequent
 	 * instances. This also makes breakpoint unwrapping easier.
 	 */
-	if (orig_ret_vaddr == trampoline_vaddr) {
+	if (chained) {
 		if (!utask->return_instances) {
 			/*
 			 * This situation is not possible. Likely we have an
@@ -1570,8 +1572,6 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 			uprobe_warn(current, "handle tail call");
 			goto fail;
 		}
-
-		chained = true;
 		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From a5b9e5a2f14f25a8dae987494d50ad3aac7366b6 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 30 Jul 2015 14:31:34 -0700
Subject: x86/ldt: Make modify_ldt() optional

The modify_ldt syscall exposes a large attack surface and is
unnecessary for modern userspace.  Make it optional.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: security@kernel.org <security@kernel.org>
Cc: xen-devel <xen-devel@lists.xen.org>
Link: http://lkml.kernel.org/r/a605166a771c343fd64802dece77a903507333bd.1438291540.git.luto@kernel.org
[ Made MATH_EMULATION dependent on MODIFY_LDT_SYSCALL. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig                   | 18 ++++++++++++++++++
 arch/x86/include/asm/mmu.h         |  2 ++
 arch/x86/include/asm/mmu_context.h | 28 +++++++++++++++++++++-------
 arch/x86/kernel/Makefile           |  3 ++-
 arch/x86/kernel/cpu/perf_event.c   |  4 ++++
 arch/x86/kernel/process_64.c       |  2 ++
 arch/x86/kernel/step.c             |  2 ++
 kernel/sys_ni.c                    |  1 +
 8 files changed, 52 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e26fe7a5b9e6..798658048db3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1036,6 +1036,7 @@ config VM86
 config X86_16BIT
 	bool "Enable support for 16-bit segments" if EXPERT
 	default y
+	depends on MODIFY_LDT_SYSCALL
 	---help---
 	  This option is required by programs like Wine to run 16-bit
 	  protected mode legacy code on x86 processors.  Disabling
@@ -1530,6 +1531,7 @@ config X86_RESERVE_LOW
 
 config MATH_EMULATION
 	bool
+	depends on MODIFY_LDT_SYSCALL
 	prompt "Math emulation" if X86_32
 	---help---
 	  Linux can emulate a math coprocessor (used for floating point
@@ -2074,6 +2076,22 @@ config CMDLINE_OVERRIDE
 	  This is used to work around broken boot loaders.  This should
 	  be set to 'N' under normal conditions.
 
+config MODIFY_LDT_SYSCALL
+	bool "Enable the LDT (local descriptor table)" if EXPERT
+	default y
+	---help---
+	  Linux can allow user programs to install a per-process x86
+	  Local Descriptor Table (LDT) using the modify_ldt(2) system
+	  call.  This is required to run 16-bit or segmented code such as
+	  DOSEMU or some Wine programs.  It is also used by some very old
+	  threading libraries.
+
+	  Enabling this feature adds a small amount of overhead to
+	  context switches and increases the low-level kernel attack
+	  surface.  Disabling it removes the modify_ldt(2) system call.
+
+	  Saying 'N' here may make sense for embedded or server kernels.
+
 source "kernel/livepatch/Kconfig"
 
 endmenu
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 364d27481a52..55234d5e7160 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,7 +9,9 @@
  * we put the segment information here.
  */
 typedef struct {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
+#endif
 
 #ifdef CONFIG_X86_64
 	/* True if mm supports a task running in 32 bit compatibility mode. */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 984abfe47edc..379cd3658799 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm)
 static inline void load_mm_cr4(struct mm_struct *mm) {}
 #endif
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 /*
  * ldt_structs can be allocated, used, and freed, but they are never
  * modified while live.
@@ -48,8 +49,23 @@ struct ldt_struct {
 	int size;
 };
 
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+#else	/* CONFIG_MODIFY_LDT_SYSCALL */
+static inline int init_new_context(struct task_struct *tsk,
+				   struct mm_struct *mm)
+{
+	return 0;
+}
+static inline void destroy_context(struct mm_struct *mm) {}
+#endif
+
 static inline void load_mm_ldt(struct mm_struct *mm)
 {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
 
 	/* lockless_dereference synchronizes with smp_store_release */
@@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 		set_ldt(ldt->entries, ldt->size);
 	else
 		clear_LDT();
+#else
+	clear_LDT();
+#endif
 
 	DEBUG_LOCKS_WARN_ON(preemptible());
 }
 
-/*
- * Used for LDT copy/destruction.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
-
-
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 #ifdef CONFIG_SMP
@@ -114,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		/* Load per-mm CR4 state */
 		load_mm_cr4(next);
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		/*
 		 * Load the LDT, if the LDT is different.
 		 *
@@ -128,6 +141,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt))
 			load_mm_ldt(next);
+#endif
 	}
 #ifdef CONFIG_SMP
 	  else {
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index dc19730ad0db..514064897d55 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,8 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
 obj-y			:= process_$(BITS).o signal.o
 obj-$(CONFIG_COMPAT)	+= signal_compat.o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y			+= time.o ioport.o ldt.o dumpstack.o nmi.o
+obj-y			+= time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 09f9ff271df4..cc25c8f3512d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	int idx = segment >> 3;
 
 	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		struct ldt_struct *ldt;
 
 		if (idx > LDT_ENTRIES)
@@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment)
 			return 0;
 
 		desc = &ldt->entries[idx];
+#else
+		return 0;
+#endif
 	} else {
 		if (idx > GDT_ENTRIES)
 			return 0;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 7ff035c6c54d..3c1bbcf12924 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all)
 void release_thread(struct task_struct *dead_task)
 {
 	if (dead_task->mm) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		if (dead_task->mm->context.ldt) {
 			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 				dead_task->comm,
@@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task)
 				dead_task->mm->context.ldt->size);
 			BUG();
 		}
+#endif
 	}
 }
 
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 6273324186ac..fd88e152d584 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		return addr;
 	}
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	/*
 	 * We'll assume that the code segments in the GDT
 	 * are all zero-based. That is largely true: the
@@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		}
 		mutex_unlock(&child->mm->context.lock);
 	}
+#endif
 
 	return addr;
 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5868d8..ca7d84f438f1 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
 cond_syscall(sys_ssetmask);
 cond_syscall(sys_vm86old);
 cond_syscall(sys_vm86);
+cond_syscall(sys_modify_ldt);
 cond_syscall(sys_ipc);
 cond_syscall(compat_sys_ipc);
 cond_syscall(compat_sys_sysctl);
-- 
cgit v1.2.3-70-g09d2


From 9fda6a0681e070b496235b132bc70ceb80300211 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:16 -0400
Subject: x86/vm86: Move vm86 fields out of 'thread_struct'

Allocate a separate structure for the vm86 fields.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-2-git-send-email-brgerst@gmail.com
[ Build fixes. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 11 +++-------
 arch/x86/include/asm/vm86.h      | 19 ++++++++++++++++-
 arch/x86/kernel/process.c        |  3 +++
 arch/x86/kernel/vm86_32.c        | 46 +++++++++++++++++++++++-----------------
 arch/x86/mm/fault.c              |  6 ++++--
 5 files changed, 55 insertions(+), 30 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index befc1341f110..9615a4e2645e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -6,8 +6,8 @@
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
+struct vm86;
 
-#include <asm/vm86.h>
 #include <asm/math_emu.h>
 #include <asm/segment.h>
 #include <asm/types.h>
@@ -400,13 +400,9 @@ struct thread_struct {
 	unsigned long		cr2;
 	unsigned long		trap_nr;
 	unsigned long		error_code;
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_VM86
 	/* Virtual 86 mode info */
-	struct vm86plus_struct __user *vm86_info;
-	unsigned long		screen_bitmap;
-	unsigned long		v86flags;
-	unsigned long		v86mask;
-	unsigned long		saved_sp0;
+	struct vm86		*vm86;
 #endif
 	/* IO permissions: */
 	unsigned long		*io_bitmap_ptr;
@@ -718,7 +714,6 @@ static inline void spin_lock_prefetch(const void *x)
 
 #define INIT_THREAD  {							  \
 	.sp0			= TOP_OF_INIT_STACK,			  \
-	.vm86_info		= NULL,					  \
 	.sysenter_cs		= __KERNEL_CS,				  \
 	.io_bitmap_ptr		= NULL,					  \
 }
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 1d8de3f3feca..20b43b7a950b 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_X86_VM86_H
 #define _ASM_X86_VM86_H
 
-
 #include <asm/ptrace.h>
 #include <uapi/asm/vm86.h>
 
@@ -58,6 +57,14 @@ struct kernel_vm86_struct {
  */
 };
 
+struct vm86 {
+	struct vm86plus_struct __user *vm86_info;
+	unsigned long screen_bitmap;
+	unsigned long v86flags;
+	unsigned long v86mask;
+	unsigned long saved_sp0;
+};
+
 #ifdef CONFIG_VM86
 
 void handle_vm86_fault(struct kernel_vm86_regs *, long);
@@ -67,6 +74,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
 struct task_struct;
 void release_vm86_irqs(struct task_struct *);
 
+#define free_vm86(t) do {				\
+	struct thread_struct *__t = (t);		\
+	if (__t->vm86 != NULL) {			\
+		kfree(__t->vm86);			\
+		__t->vm86 = NULL;			\
+	}						\
+} while (0)
+
 #else
 
 #define handle_vm86_fault(a, b)
@@ -77,6 +92,8 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
 	return 0;
 }
 
+#define free_vm86(t) do { } while(0)
+
 #endif /* CONFIG_VM86 */
 
 #endif /* _ASM_X86_VM86_H */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 397688beed4b..2199d9b774c8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -29,6 +29,7 @@
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
+#include <asm/vm86.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -110,6 +111,8 @@ void exit_thread(void)
 		kfree(bp);
 	}
 
+	free_vm86(t);
+
 	fpu__drop(fpu);
 }
 
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e6c2b47ec261..bfa59b1400b9 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -44,6 +44,7 @@
 #include <linux/ptrace.h>
 #include <linux/audit.h>
 #include <linux/stddef.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -81,8 +82,8 @@
 /*
  * virtual flags (16 and 32-bit versions)
  */
-#define VFLAGS	(*(unsigned short *)&(current->thread.v86flags))
-#define VEFLAGS	(current->thread.v86flags)
+#define VFLAGS	(*(unsigned short *)&(current->thread.vm86->v86flags))
+#define VEFLAGS	(current->thread.vm86->v86flags)
 
 #define set_flags(X, new, mask) \
 ((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -96,6 +97,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	struct pt_regs *ret;
 	struct task_struct *tsk = current;
 	struct vm86plus_struct __user *user;
+	struct vm86 *vm86 = current->thread.vm86;
 	long err = 0;
 
 	/*
@@ -105,12 +107,12 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	 */
 	local_irq_enable();
 
-	if (!tsk->thread.vm86_info) {
+	if (!vm86 || !vm86->vm86_info) {
 		pr_alert("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | tsk->thread.v86mask);
-	user = tsk->thread.vm86_info;
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask);
+	user = vm86->vm86_info;
 
 	if (!access_ok(VERIFY_WRITE, user, VMPI.is_vm86pus ?
 		       sizeof(struct vm86plus_struct) :
@@ -137,7 +139,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 		put_user_ex(regs->fs, &user->regs.fs);
 		put_user_ex(regs->gs, &user->regs.gs);
 
-		put_user_ex(tsk->thread.screen_bitmap, &user->screen_bitmap);
+		put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
 	} put_user_catch(err);
 	if (err) {
 		pr_alert("could not access userspace vm86_info\n");
@@ -145,10 +147,10 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	}
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	tsk->thread.sp0 = tsk->thread.saved_sp0;
+	tsk->thread.sp0 = vm86->saved_sp0;
 	tsk->thread.sysenter_cs = __KERNEL_CS;
 	load_sp0(tss, &tsk->thread);
-	tsk->thread.saved_sp0 = 0;
+	vm86->saved_sp0 = 0;
 	put_cpu();
 
 	ret = KVM86->regs32;
@@ -242,9 +244,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 {
 	struct tss_struct *tss;
 	struct task_struct *tsk = current;
+	struct vm86 *vm86 = tsk->thread.vm86;
 	unsigned long err = 0;
 
-	if (tsk->thread.saved_sp0)
+	if (!vm86) {
+		if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+			return -ENOMEM;
+		tsk->thread.vm86 = vm86;
+	}
+	if (vm86->saved_sp0)
 		return -EPERM;
 
 	if (!access_ok(VERIFY_READ, v86, plus ?
@@ -295,7 +303,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 	}
 
 	info->regs32 = current_pt_regs();
-	tsk->thread.vm86_info = v86;
+	vm86->vm86_info = v86;
 
 /*
  * The flags register is also special: we cannot trust that the user
@@ -311,16 +319,16 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 
 	switch (info->cpu_type) {
 	case CPU_286:
-		tsk->thread.v86mask = 0;
+		vm86->v86mask = 0;
 		break;
 	case CPU_386:
-		tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	case CPU_486:
-		tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	default:
-		tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	}
 
@@ -328,7 +336,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
  * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
  */
 	info->regs32->ax = VM86_SIGNAL;
-	tsk->thread.saved_sp0 = tsk->thread.sp0;
+	vm86->saved_sp0 = tsk->thread.sp0;
 	lazy_save_gs(info->regs32->gs);
 
 	tss = &per_cpu(cpu_tss, get_cpu());
@@ -338,7 +346,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 	load_sp0(tss, &tsk->thread);
 	put_cpu();
 
-	tsk->thread.screen_bitmap = info->screen_bitmap;
+	vm86->screen_bitmap = info->screen_bitmap;
 	if (info->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
@@ -408,7 +416,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
 
 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VEFLAGS, flags, current->thread.v86mask);
+	set_flags(VEFLAGS, flags, current->thread.vm86->v86mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -418,7 +426,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
 
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VFLAGS, flags, current->thread.v86mask);
+	set_flags(VFLAGS, flags, current->thread.vm86->v86mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -433,7 +441,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
 	if (VEFLAGS & X86_EFLAGS_VIF)
 		flags |= X86_EFLAGS_IF;
 	flags |= X86_EFLAGS_IOPL;
-	return flags | (VEFLAGS & current->thread.v86mask);
+	return flags | (VEFLAGS & current->thread.vm86->v86mask);
 }
 
 static inline int is_revectored(int nr, struct revectored_struct *bitmap)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9dc909841739..34a368d2d533 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -301,14 +301,16 @@ static inline void
 check_v8086_mode(struct pt_regs *regs, unsigned long address,
 		 struct task_struct *tsk)
 {
+#ifdef CONFIG_VM86
 	unsigned long bit;
 
-	if (!v8086_mode(regs))
+	if (!v8086_mode(regs) || !tsk->thread.vm86)
 		return;
 
 	bit = (address - 0xA0000) >> PAGE_SHIFT;
 	if (bit < 32)
-		tsk->thread.screen_bitmap |= 1 << bit;
+		tsk->thread.vm86->screen_bitmap |= 1 << bit;
+#endif
 }
 
 static bool low_pfn(unsigned long pfn)
-- 
cgit v1.2.3-70-g09d2


From d4ce0f26c790af8e829d3fad0a6787f40f98e24f Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:17 -0400
Subject: x86/vm86: Move fields from 'struct kernel_vm86_struct' to 'struct
 vm86'

Move the non-regs fields to the off-stack data.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-3-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/vm86.h | 16 ++++++++--------
 arch/x86/kernel/vm86_32.c   | 42 ++++++++++++++++++++++--------------------
 2 files changed, 30 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 20b43b7a950b..47c7648338ee 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -37,13 +37,7 @@ struct kernel_vm86_struct {
  * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
  * in kernelspace, hence we need not reget the data from userspace.
  */
-#define VM86_TSS_ESP0 flags
-	unsigned long flags;
-	unsigned long screen_bitmap;
-	unsigned long cpu_type;
-	struct revectored_struct int_revectored;
-	struct revectored_struct int21_revectored;
-	struct vm86plus_info_struct vm86plus;
+#define VM86_TSS_ESP0 regs32
 	struct pt_regs *regs32;   /* here we save the pointer to the old regs */
 /*
  * The below is not part of the structure, but the stack layout continues
@@ -59,10 +53,16 @@ struct kernel_vm86_struct {
 
 struct vm86 {
 	struct vm86plus_struct __user *vm86_info;
-	unsigned long screen_bitmap;
 	unsigned long v86flags;
 	unsigned long v86mask;
 	unsigned long saved_sp0;
+
+	unsigned long flags;
+	unsigned long screen_bitmap;
+	unsigned long cpu_type;
+	struct revectored_struct int_revectored;
+	struct revectored_struct int21_revectored;
+	struct vm86plus_info_struct vm86plus;
 };
 
 #ifdef CONFIG_VM86
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index bfa59b1400b9..f71b4b9452f1 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -68,7 +68,6 @@
 
 
 #define KVM86	((struct kernel_vm86_struct *)regs)
-#define VMPI	KVM86->vm86plus
 
 
 /*
@@ -114,7 +113,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask);
 	user = vm86->vm86_info;
 
-	if (!access_ok(VERIFY_WRITE, user, VMPI.is_vm86pus ?
+	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
 		       sizeof(struct vm86plus_struct) :
 		       sizeof(struct vm86_struct))) {
 		pr_alert("could not access userspace vm86_info\n");
@@ -282,25 +281,27 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 		get_user_ex(info->regs.fs, &v86->regs.fs);
 		get_user_ex(info->regs.gs, &v86->regs.gs);
 
-		get_user_ex(info->flags, &v86->flags);
-		get_user_ex(info->screen_bitmap, &v86->screen_bitmap);
-		get_user_ex(info->cpu_type, &v86->cpu_type);
+		get_user_ex(vm86->flags, &v86->flags);
+		get_user_ex(vm86->screen_bitmap, &v86->screen_bitmap);
+		get_user_ex(vm86->cpu_type, &v86->cpu_type);
 	} get_user_catch(err);
 	if (err)
 		return err;
 
-	if (copy_from_user(&info->int_revectored, &v86->int_revectored,
+	if (copy_from_user(&vm86->int_revectored, &v86->int_revectored,
 			   sizeof(struct revectored_struct)))
 		return -EFAULT;
-	if (copy_from_user(&info->int21_revectored, &v86->int21_revectored,
+	if (copy_from_user(&vm86->int21_revectored, &v86->int21_revectored,
 			   sizeof(struct revectored_struct)))
 		return -EFAULT;
 	if (plus) {
-		if (copy_from_user(&info->vm86plus, &v86->vm86plus,
+		if (copy_from_user(&vm86->vm86plus, &v86->vm86plus,
 				   sizeof(struct vm86plus_info_struct)))
 			return -EFAULT;
-		info->vm86plus.is_vm86pus = 1;
-	}
+		vm86->vm86plus.is_vm86pus = 1;
+	} else
+		memset(&vm86->vm86plus, 0,
+		       sizeof(struct vm86plus_info_struct));
 
 	info->regs32 = current_pt_regs();
 	vm86->vm86_info = v86;
@@ -317,7 +318,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 
 	info->regs.pt.orig_ax = info->regs32->orig_ax;
 
-	switch (info->cpu_type) {
+	switch (vm86->cpu_type) {
 	case CPU_286:
 		vm86->v86mask = 0;
 		break;
@@ -346,8 +347,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 	load_sp0(tss, &tsk->thread);
 	put_cpu();
 
-	vm86->screen_bitmap = info->screen_bitmap;
-	if (info->flags & VM86_SCREEN_BITMAP)
+	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
 	/*call __audit_syscall_exit since we do not exit via the normal paths */
@@ -539,12 +539,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 {
 	unsigned long __user *intr_ptr;
 	unsigned long segoffs;
+	struct kernel_vm86_info *vm86 = current->thread.vm86;
 
 	if (regs->pt.cs == BIOSSEG)
 		goto cannot_handle;
-	if (is_revectored(i, &KVM86->int_revectored))
+	if (is_revectored(i, &vm86->int_revectored))
 		goto cannot_handle;
-	if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
+	if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
 		goto cannot_handle;
 	intr_ptr = (unsigned long __user *) (i << 2);
 	if (get_user(segoffs, intr_ptr))
@@ -568,7 +569,7 @@ cannot_handle:
 
 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 {
-	if (VMPI.is_vm86pus) {
+	if (current->thread.vm86->vm86plus.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
 			KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
 			/* setting this flag forces the code in entry_32.S to
@@ -595,12 +596,13 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	unsigned char __user *ssp;
 	unsigned short ip, sp, orig_flags;
 	int data32, pref_done;
+	struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
 
 #define CHECK_IF_IN_TRAP \
-	if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+	if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
 		newflags |= X86_EFLAGS_TF
 #define VM86_FAULT_RETURN do { \
-	if (VMPI.force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
+	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
 		return_to_32bit(regs, VM86_PICRETURN); \
 	if (orig_flags & X86_EFLAGS_TF) \
 		handle_vm86_trap(regs, 0, 1); \
@@ -670,8 +672,8 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	case 0xcd: {
 		int intno = popb(csp, ip, simulate_sigsegv);
 		IP(regs) = ip;
-		if (VMPI.vm86dbg_active) {
-			if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
+		if (vmpi->vm86dbg_active) {
+			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3])
 				return_to_32bit(regs, VM86_INTx + (intno << 8));
 		}
 		do_int(regs, intno, ssp, sp);
-- 
cgit v1.2.3-70-g09d2


From 90c6085a248f8f964588617f51329688bcc9f2bc Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:18 -0400
Subject: x86/vm86: Eliminate 'struct kernel_vm86_struct'

Now there is no vm86-specific data left on the kernel stack
while in userspace, except for the 32-bit regs.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-4-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/vm86.h | 25 +-----------
 arch/x86/kernel/vm86_32.c   | 95 +++++++++++++++++++--------------------------
 2 files changed, 42 insertions(+), 78 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 47c7648338ee..226d6c157ebc 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -27,32 +27,9 @@ struct kernel_vm86_regs {
 	unsigned short gs, __gsh;
 };
 
-struct kernel_vm86_struct {
-	struct kernel_vm86_regs regs;
-/*
- * the below part remains on the kernel stack while we are in VM86 mode.
- * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
- * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
- * 'struct kernel_vm86_regs' with the then actual values.
- * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
- * in kernelspace, hence we need not reget the data from userspace.
- */
-#define VM86_TSS_ESP0 regs32
-	struct pt_regs *regs32;   /* here we save the pointer to the old regs */
-/*
- * The below is not part of the structure, but the stack layout continues
- * this way. In front of 'return-eip' may be some data, depending on
- * compilation, so we don't rely on this and save the pointer to 'oldregs'
- * in 'regs32' above.
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
-
-	long return-eip;        from call to vm86()
-	struct pt_regs oldregs;  user space registers as saved by syscall
- */
-};
-
 struct vm86 {
 	struct vm86plus_struct __user *vm86_info;
+	struct pt_regs *regs32;
 	unsigned long v86flags;
 	unsigned long v86mask;
 	unsigned long saved_sp0;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index f71b4b9452f1..696ef767f2be 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -67,9 +67,6 @@
  */
 
 
-#define KVM86	((struct kernel_vm86_struct *)regs)
-
-
 /*
  * 8- and 16-bit register defines..
  */
@@ -152,7 +149,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	vm86->saved_sp0 = 0;
 	put_cpu();
 
-	ret = KVM86->regs32;
+	ret = vm86->regs32;
 
 	lazy_load_gs(ret->gs);
 
@@ -194,29 +191,16 @@ out:
 
 
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
-			struct kernel_vm86_struct *info);
+static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus);
 
 SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-
-	return do_sys_vm86((struct vm86plus_struct __user *) v86, false, &info);
+	return do_sys_vm86((struct vm86plus_struct __user *) v86, false);
 }
 
 
 SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-
 	switch (cmd) {
 	case VM86_REQUEST_IRQ:
 	case VM86_FREE_IRQ:
@@ -234,16 +218,17 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 	}
 
 	/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
-	return do_sys_vm86((struct vm86plus_struct __user *) arg, true, &info);
+	return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
 }
 
 
-static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
-			struct kernel_vm86_struct *info)
+static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 {
 	struct tss_struct *tss;
 	struct task_struct *tsk = current;
 	struct vm86 *vm86 = tsk->thread.vm86;
+	struct kernel_vm86_regs vm86regs;
+	struct pt_regs *regs32 = current_pt_regs();
 	unsigned long err = 0;
 
 	if (!vm86) {
@@ -259,27 +244,27 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 		       sizeof(struct vm86plus_struct)))
 		return -EFAULT;
 
-	memset(info, 0, sizeof(*info));
+	memset(&vm86regs, 0, sizeof(vm86regs));
 	get_user_try {
 		unsigned short seg;
-		get_user_ex(info->regs.pt.bx, &v86->regs.ebx);
-		get_user_ex(info->regs.pt.cx, &v86->regs.ecx);
-		get_user_ex(info->regs.pt.dx, &v86->regs.edx);
-		get_user_ex(info->regs.pt.si, &v86->regs.esi);
-		get_user_ex(info->regs.pt.di, &v86->regs.edi);
-		get_user_ex(info->regs.pt.bp, &v86->regs.ebp);
-		get_user_ex(info->regs.pt.ax, &v86->regs.eax);
-		get_user_ex(info->regs.pt.ip, &v86->regs.eip);
+		get_user_ex(vm86regs.pt.bx, &v86->regs.ebx);
+		get_user_ex(vm86regs.pt.cx, &v86->regs.ecx);
+		get_user_ex(vm86regs.pt.dx, &v86->regs.edx);
+		get_user_ex(vm86regs.pt.si, &v86->regs.esi);
+		get_user_ex(vm86regs.pt.di, &v86->regs.edi);
+		get_user_ex(vm86regs.pt.bp, &v86->regs.ebp);
+		get_user_ex(vm86regs.pt.ax, &v86->regs.eax);
+		get_user_ex(vm86regs.pt.ip, &v86->regs.eip);
 		get_user_ex(seg, &v86->regs.cs);
-		info->regs.pt.cs = seg;
-		get_user_ex(info->regs.pt.flags, &v86->regs.eflags);
-		get_user_ex(info->regs.pt.sp, &v86->regs.esp);
+		vm86regs.pt.cs = seg;
+		get_user_ex(vm86regs.pt.flags, &v86->regs.eflags);
+		get_user_ex(vm86regs.pt.sp, &v86->regs.esp);
 		get_user_ex(seg, &v86->regs.ss);
-		info->regs.pt.ss = seg;
-		get_user_ex(info->regs.es, &v86->regs.es);
-		get_user_ex(info->regs.ds, &v86->regs.ds);
-		get_user_ex(info->regs.fs, &v86->regs.fs);
-		get_user_ex(info->regs.gs, &v86->regs.gs);
+		vm86regs.pt.ss = seg;
+		get_user_ex(vm86regs.es, &v86->regs.es);
+		get_user_ex(vm86regs.ds, &v86->regs.ds);
+		get_user_ex(vm86regs.fs, &v86->regs.fs);
+		get_user_ex(vm86regs.gs, &v86->regs.gs);
 
 		get_user_ex(vm86->flags, &v86->flags);
 		get_user_ex(vm86->screen_bitmap, &v86->screen_bitmap);
@@ -302,8 +287,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 	} else
 		memset(&vm86->vm86plus, 0,
 		       sizeof(struct vm86plus_info_struct));
-
-	info->regs32 = current_pt_regs();
+	vm86->regs32 = regs32;
 	vm86->vm86_info = v86;
 
 /*
@@ -311,12 +295,12 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
  * has set it up safely, so this makes sure interrupt etc flags are
  * inherited from protected mode.
  */
-	VEFLAGS = info->regs.pt.flags;
-	info->regs.pt.flags &= SAFE_MASK;
-	info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
-	info->regs.pt.flags |= X86_VM_MASK;
+	VEFLAGS = vm86regs.pt.flags;
+	vm86regs.pt.flags &= SAFE_MASK;
+	vm86regs.pt.flags |= regs32->flags & ~SAFE_MASK;
+	vm86regs.pt.flags |= X86_VM_MASK;
 
-	info->regs.pt.orig_ax = info->regs32->orig_ax;
+	vm86regs.pt.orig_ax = regs32->orig_ax;
 
 	switch (vm86->cpu_type) {
 	case CPU_286:
@@ -336,12 +320,13 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 /*
  * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
  */
-	info->regs32->ax = VM86_SIGNAL;
+	regs32->ax = VM86_SIGNAL;
 	vm86->saved_sp0 = tsk->thread.sp0;
-	lazy_save_gs(info->regs32->gs);
+	lazy_save_gs(regs32->gs);
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
+	/* Set new sp0 right below 32-bit regs */
+	tsk->thread.sp0 = (unsigned long) regs32;
 	if (cpu_has_sep)
 		tsk->thread.sysenter_cs = 0;
 	load_sp0(tss, &tsk->thread);
@@ -364,7 +349,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 #endif
 		"jmp resume_userspace"
 		: /* no outputs */
-		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
+		:"r" (&vm86regs), "r" (task_thread_info(tsk)), "r" (0));
 	unreachable();	/* we never return here */
 }
 
@@ -539,7 +524,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 {
 	unsigned long __user *intr_ptr;
 	unsigned long segoffs;
-	struct kernel_vm86_info *vm86 = current->thread.vm86;
+	struct vm86 *vm86 = current->thread.vm86;
 
 	if (regs->pt.cs == BIOSSEG)
 		goto cannot_handle;
@@ -569,12 +554,14 @@ cannot_handle:
 
 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 {
-	if (current->thread.vm86->vm86plus.is_vm86pus) {
+	struct vm86 *vm86 = current->thread.vm86;
+
+	if (vm86->vm86plus.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
-			KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
+			vm86->regs32->ax = VM86_TRAP + (trapno << 8);
 			/* setting this flag forces the code in entry_32.S to
 			   the path where we call save_v86_state() and change
-			   the stack pointer to KVM86->regs32 */
+			   the stack pointer to regs32 */
 			set_thread_flag(TIF_NOTIFY_RESUME);
 			return 0;
 		}
-- 
cgit v1.2.3-70-g09d2


From 5ed92a8ab71f8865ba07811429c988c72299b315 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:19 -0400
Subject: x86/vm86: Use the normal pt_regs area for vm86

Change to use the normal pt_regs area to enter and exit vm86
mode.  This is done by increasing the padding at the top of the
stack to make room for the extra vm86 segment slots in the IRET
frame.  It then saves the 32-bit regs in the off-stack vm86
data, and copies in the vm86 regs.  Exiting back to 32-bit mode
does the reverse.  This allows removing the hacks to jump
directly into the exit asm code due to having to change the
stack pointer.  Returning normally from the vm86 syscall and the
exception handlers allows things like ptrace and auditing to work properly.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-5-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_32.S          |  24 +-------
 arch/x86/include/asm/thread_info.h |  11 ++--
 arch/x86/include/asm/vm86.h        |   6 +-
 arch/x86/kernel/signal.c           |   3 +
 arch/x86/kernel/vm86_32.c          | 110 +++++++++++++++----------------------
 5 files changed, 60 insertions(+), 94 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 21dc60a60b5f..f940e24acaf0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -525,34 +525,12 @@ work_resched:
 
 work_notifysig:					# deal with pending signals and
 						# notify-resume requests
-#ifdef CONFIG_VM86
-	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esp)
-	movl	%esp, %eax
-	jnz	work_notifysig_v86		# returning to kernel-space or
-						# vm86-space
-1:
-#else
-	movl	%esp, %eax
-#endif
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	movb	PT_CS(%esp), %bl
-	andb	$SEGMENT_RPL_MASK, %bl
-	cmpb	$USER_RPL, %bl
-	jb	resume_kernel
+	movl	%esp, %eax
 	xorl	%edx, %edx
 	call	do_notify_resume
 	jmp	resume_userspace
-
-#ifdef CONFIG_VM86
-	ALIGN
-work_notifysig_v86:
-	pushl	%ecx				# save ti_flags for do_notify_resume
-	call	save_v86_state			# %eax contains pt_regs pointer
-	popl	%ecx
-	movl	%eax, %esp
-	jmp	1b
-#endif
 END(work_pending)
 
 	# perform syscall exit tracing
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 225ee545e1a0..fdad5c244350 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -27,14 +27,17 @@
  * Without this offset, that can result in a page fault.  (We are
  * careful that, in this case, the value we read doesn't matter.)
  *
- * In vm86 mode, the hardware frame is much longer still, but we neither
- * access the extra members from NMI context, nor do we write such a
- * frame at sp0 at all.
+ * In vm86 mode, the hardware frame is much longer still, so add 16
+ * bytes to make room for the real-mode segments.
  *
  * x86_64 has a fixed-length stack frame.
  */
 #ifdef CONFIG_X86_32
-# define TOP_OF_KERNEL_STACK_PADDING 8
+# ifdef CONFIG_VM86
+#  define TOP_OF_KERNEL_STACK_PADDING 16
+# else
+#  define TOP_OF_KERNEL_STACK_PADDING 8
+# endif
 #else
 # define TOP_OF_KERNEL_STACK_PADDING 0
 #endif
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 226d6c157ebc..e45386eee17a 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -29,7 +29,7 @@ struct kernel_vm86_regs {
 
 struct vm86 {
 	struct vm86plus_struct __user *vm86_info;
-	struct pt_regs *regs32;
+	struct pt_regs regs32;
 	unsigned long v86flags;
 	unsigned long v86mask;
 	unsigned long saved_sp0;
@@ -46,7 +46,7 @@ struct vm86 {
 
 void handle_vm86_fault(struct kernel_vm86_regs *, long);
 int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
+void save_v86_state(struct kernel_vm86_regs *, int);
 
 struct task_struct;
 void release_vm86_irqs(struct task_struct *);
@@ -69,6 +69,8 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
 	return 0;
 }
 
+static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
+
 #define free_vm86(t) do { } while(0)
 
 #endif /* CONFIG_VM86 */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 7e88cc782712..bfd736e80c89 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -635,6 +635,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	bool stepping, failed;
 	struct fpu *fpu = &current->thread.fpu;
 
+	if (v8086_mode(regs))
+		save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
+
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 696ef767f2be..ffe98eceda77 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -50,6 +50,7 @@
 #include <asm/io.h>
 #include <asm/tlbflush.h>
 #include <asm/irq.h>
+#include <asm/traps.h>
 
 /*
  * Known problems:
@@ -87,10 +88,9 @@
 #define SAFE_MASK	(0xDD5)
 #define RETURN_MASK	(0xDFF)
 
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 {
 	struct tss_struct *tss;
-	struct pt_regs *ret;
 	struct task_struct *tsk = current;
 	struct vm86plus_struct __user *user;
 	struct vm86 *vm86 = current->thread.vm86;
@@ -149,11 +149,11 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	vm86->saved_sp0 = 0;
 	put_cpu();
 
-	ret = vm86->regs32;
+	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
 
-	lazy_load_gs(ret->gs);
+	lazy_load_gs(vm86->regs32.gs);
 
-	return ret;
+	regs->pt.ax = retval;
 }
 
 static void mark_screen_rdonly(struct mm_struct *mm)
@@ -228,7 +228,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	struct task_struct *tsk = current;
 	struct vm86 *vm86 = tsk->thread.vm86;
 	struct kernel_vm86_regs vm86regs;
-	struct pt_regs *regs32 = current_pt_regs();
+	struct pt_regs *regs = current_pt_regs();
 	unsigned long err = 0;
 
 	if (!vm86) {
@@ -287,7 +287,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	} else
 		memset(&vm86->vm86plus, 0,
 		       sizeof(struct vm86plus_info_struct));
-	vm86->regs32 = regs32;
+
+	memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
 	vm86->vm86_info = v86;
 
 /*
@@ -297,10 +298,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
  */
 	VEFLAGS = vm86regs.pt.flags;
 	vm86regs.pt.flags &= SAFE_MASK;
-	vm86regs.pt.flags |= regs32->flags & ~SAFE_MASK;
+	vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
 	vm86regs.pt.flags |= X86_VM_MASK;
 
-	vm86regs.pt.orig_ax = regs32->orig_ax;
+	vm86regs.pt.orig_ax = regs->orig_ax;
 
 	switch (vm86->cpu_type) {
 	case CPU_286:
@@ -318,15 +319,14 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	}
 
 /*
- * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
+ * Save old state
  */
-	regs32->ax = VM86_SIGNAL;
 	vm86->saved_sp0 = tsk->thread.sp0;
-	lazy_save_gs(regs32->gs);
+	lazy_save_gs(vm86->regs32.gs);
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	/* Set new sp0 right below 32-bit regs */
-	tsk->thread.sp0 = (unsigned long) regs32;
+	/* make room for real-mode segments */
+	tsk->thread.sp0 += 16;
 	if (cpu_has_sep)
 		tsk->thread.sysenter_cs = 0;
 	load_sp0(tss, &tsk->thread);
@@ -335,41 +335,14 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
-	/*call __audit_syscall_exit since we do not exit via the normal paths */
-#ifdef CONFIG_AUDITSYSCALL
-	if (unlikely(current->audit_context))
-		__audit_syscall_exit(1, 0);
-#endif
-
-	__asm__ __volatile__(
-		"movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-#ifdef CONFIG_X86_32_LAZY_GS
-		"mov  %2, %%gs\n\t"
-#endif
-		"jmp resume_userspace"
-		: /* no outputs */
-		:"r" (&vm86regs), "r" (task_thread_info(tsk)), "r" (0));
-	unreachable();	/* we never return here */
-}
-
-static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-{
-	struct pt_regs *regs32;
-
-	regs32 = save_v86_state(regs16);
-	regs32->ax = retval;
-	__asm__ __volatile__("movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-		"jmp resume_userspace"
-		: : "r" (regs32), "r" (current_thread_info()));
+	memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+	force_iret();
+	return regs->ax;
 }
 
 static inline void set_IF(struct kernel_vm86_regs *regs)
 {
 	VEFLAGS |= X86_EFLAGS_VIF;
-	if (VEFLAGS & X86_EFLAGS_VIP)
-		return_to_32bit(regs, VM86_STI);
 }
 
 static inline void clear_IF(struct kernel_vm86_regs *regs)
@@ -549,7 +522,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 	return;
 
 cannot_handle:
-	return_to_32bit(regs, VM86_INTx + (i << 8));
+	save_v86_state(regs, VM86_INTx + (i << 8));
 }
 
 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
@@ -558,11 +531,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 
 	if (vm86->vm86plus.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
-			vm86->regs32->ax = VM86_TRAP + (trapno << 8);
-			/* setting this flag forces the code in entry_32.S to
-			   the path where we call save_v86_state() and change
-			   the stack pointer to regs32 */
-			set_thread_flag(TIF_NOTIFY_RESUME);
+			save_v86_state(regs, VM86_TRAP + (trapno << 8));
 			return 0;
 		}
 		do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@@ -588,12 +557,6 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 #define CHECK_IF_IN_TRAP \
 	if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
 		newflags |= X86_EFLAGS_TF
-#define VM86_FAULT_RETURN do { \
-	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
-		return_to_32bit(regs, VM86_PICRETURN); \
-	if (orig_flags & X86_EFLAGS_TF) \
-		handle_vm86_trap(regs, 0, 1); \
-	return; } while (0)
 
 	orig_flags = *(unsigned short *)&regs->pt.flags;
 
@@ -632,7 +595,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 			SP(regs) -= 2;
 		}
 		IP(regs) = ip;
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;
 
 	/* popf */
 	case 0x9d:
@@ -652,7 +615,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		else
 			set_vflags_short(newflags, regs);
 
-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}
 
 	/* int xx */
@@ -660,8 +623,10 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		int intno = popb(csp, ip, simulate_sigsegv);
 		IP(regs) = ip;
 		if (vmpi->vm86dbg_active) {
-			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3])
-				return_to_32bit(regs, VM86_INTx + (intno << 8));
+			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+				save_v86_state(regs, VM86_INTx + (intno << 8));
+				return;
+			}
 		}
 		do_int(regs, intno, ssp, sp);
 		return;
@@ -692,14 +657,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		} else {
 			set_vflags_short(newflags, regs);
 		}
-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}
 
 	/* cli */
 	case 0xfa:
 		IP(regs) = ip;
 		clear_IF(regs);
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;
 
 	/* sti */
 	/*
@@ -711,12 +676,27 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	case 0xfb:
 		IP(regs) = ip;
 		set_IF(regs);
-		VM86_FAULT_RETURN;
+		goto check_vip;
 
 	default:
-		return_to_32bit(regs, VM86_UNKNOWN);
+		save_v86_state(regs, VM86_UNKNOWN);
+	}
+
+	return;
+
+check_vip:
+	if (VEFLAGS & X86_EFLAGS_VIP) {
+		save_v86_state(regs, VM86_STI);
+		return;
 	}
 
+vm86_fault_return:
+	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+		save_v86_state(regs, VM86_PICRETURN);
+		return;
+	}
+	if (orig_flags & X86_EFLAGS_TF)
+		handle_vm86_trap(regs, 0, X86_TRAP_DB);
 	return;
 
 simulate_sigsegv:
@@ -730,7 +710,7 @@ simulate_sigsegv:
 	 *        should be a mixture of the two, but how do we
 	 *        get the information? [KD]
 	 */
-	return_to_32bit(regs, VM86_UNKNOWN);
+	save_v86_state(regs, VM86_UNKNOWN);
 }
 
 /* ---------------- vm86 special IRQ passing stuff ----------------- */
-- 
cgit v1.2.3-70-g09d2


From ba3e127ec105e790eeec4034d9769e018e4a1b54 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:21 -0400
Subject: x86/vm86: Clean up vm86.h includes

vm86.h was being implicitly included in alot of places via
processor.h, which in turn got it from math_emu.h.  Break that
chain and explicitly include vm86.h in all files that need it.
Also remove unused vm86 field from math_emu_info.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-7-git-send-email-brgerst@gmail.com
[ Fixed build failure. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/math_emu.h | 6 +-----
 arch/x86/include/asm/syscalls.h | 1 +
 arch/x86/kernel/process_32.c    | 1 +
 arch/x86/kernel/signal.c        | 1 +
 arch/x86/kernel/traps.c         | 1 +
 arch/x86/kernel/vm86_32.c       | 1 +
 arch/x86/math-emu/get_address.c | 1 +
 arch/x86/mm/fault.c             | 1 +
 drivers/scsi/dpt_i2o.c          | 3 +++
 9 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
index 031f6266f425..0d9b14f60d2c 100644
--- a/arch/x86/include/asm/math_emu.h
+++ b/arch/x86/include/asm/math_emu.h
@@ -2,7 +2,6 @@
 #define _ASM_X86_MATH_EMU_H
 
 #include <asm/ptrace.h>
-#include <asm/vm86.h>
 
 /* This structure matches the layout of the data saved to the stack
    following a device-not-present interrupt, part of it saved
@@ -10,9 +9,6 @@
    */
 struct math_emu_info {
 	long ___orig_eip;
-	union {
-		struct pt_regs *regs;
-		struct kernel_vm86_regs *vm86;
-	};
+	struct pt_regs *regs;
 };
 #endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 592a6a672e07..91dfcafe27a6 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *);
 asmlinkage unsigned long sys_sigreturn(void);
 
 /* kernel/vm86_32.c */
+struct vm86_struct;
 asmlinkage long sys_vm86old(struct vm86_struct __user *);
 asmlinkage long sys_vm86(unsigned long, unsigned long);
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f73c962fe636..c13df2c735f8 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -53,6 +53,7 @@
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
+#include <asm/vm86.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index bfd736e80c89..07eb84407036 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -31,6 +31,7 @@
 #include <asm/vdso.h>
 #include <asm/mce.h>
 #include <asm/sighandling.h>
+#include <asm/vm86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8e65d8a9b8db..86a82eafb96f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,6 +62,7 @@
 #include <asm/fpu/xstate.h>
 #include <asm/trace/mpx.h>
 #include <asm/mpx.h>
+#include <asm/vm86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index ffe98eceda77..0de1f66ad001 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -51,6 +51,7 @@
 #include <asm/tlbflush.h>
 #include <asm/irq.h>
 #include <asm/traps.h>
+#include <asm/vm86.h>
 
 /*
  * Known problems:
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index 6ef5e99380f9..a2eefb121a5f 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -21,6 +21,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/desc.h>
+#include <asm/vm86.h>
 
 #include "fpu_system.h"
 #include "exception.h"
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 34a368d2d533..eef44d9a3f77 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,7 @@
 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
 #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
+#include <asm/vm86.h>			/* struct vm86			*/
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index f35ed53adaac..d4cda5e9600e 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -1924,6 +1924,9 @@ static void adpt_alpha_info(sysInfo_S* si)
 #endif
 
 #if defined __i386__
+
+#include <uapi/asm/vm86.h>
+
 static void adpt_i386_info(sysInfo_S* si)
 {
 	// This is all the info we need for now
-- 
cgit v1.2.3-70-g09d2


From 1342635638cba9b7c8eac776da5e54390d14d313 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:22 -0400
Subject: x86/vm86: Rename vm86->vm86_info to user_vm86

Make it clearer that this is the pointer to the userspace vm86
state area.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-8-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/vm86.h |  2 +-
 arch/x86/kernel/vm86_32.c   | 70 +++++++++++++++++++++++----------------------
 2 files changed, 37 insertions(+), 35 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index b0631965b935..c93ae7379550 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -28,7 +28,7 @@ struct kernel_vm86_regs {
 };
 
 struct vm86 {
-	struct vm86plus_struct __user *vm86_info;
+	struct vm86plus_struct __user *user_vm86;
 	struct pt_regs regs32;
 	unsigned long v86flags;
 	unsigned long v86mask;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 0de1f66ad001..52aa33e2baca 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -104,17 +104,17 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 	 */
 	local_irq_enable();
 
-	if (!vm86 || !vm86->vm86_info) {
-		pr_alert("no vm86_info: BAD\n");
+	if (!vm86 || !vm86->user_vm86) {
+		pr_alert("no user_vm86: BAD\n");
 		do_exit(SIGSEGV);
 	}
 	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask);
-	user = vm86->vm86_info;
+	user = vm86->user_vm86;
 
 	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
 		       sizeof(struct vm86plus_struct) :
 		       sizeof(struct vm86_struct))) {
-		pr_alert("could not access userspace vm86_info\n");
+		pr_alert("could not access userspace vm86 info\n");
 		do_exit(SIGSEGV);
 	}
 
@@ -139,7 +139,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 		put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
 	} put_user_catch(err);
 	if (err) {
-		pr_alert("could not access userspace vm86_info\n");
+		pr_alert("could not access userspace vm86 info\n");
 		do_exit(SIGSEGV);
 	}
 
@@ -192,11 +192,11 @@ out:
 
 
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
 
-SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
 {
-	return do_sys_vm86((struct vm86plus_struct __user *) v86, false);
+	return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
 }
 
 
@@ -223,7 +223,7 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 }
 
 
-static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 {
 	struct tss_struct *tss;
 	struct task_struct *tsk = current;
@@ -240,7 +240,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	if (vm86->saved_sp0)
 		return -EPERM;
 
-	if (!access_ok(VERIFY_READ, v86, plus ?
+	if (!access_ok(VERIFY_READ, user_vm86, plus ?
 		       sizeof(struct vm86_struct) :
 		       sizeof(struct vm86plus_struct)))
 		return -EFAULT;
@@ -248,40 +248,42 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	memset(&vm86regs, 0, sizeof(vm86regs));
 	get_user_try {
 		unsigned short seg;
-		get_user_ex(vm86regs.pt.bx, &v86->regs.ebx);
-		get_user_ex(vm86regs.pt.cx, &v86->regs.ecx);
-		get_user_ex(vm86regs.pt.dx, &v86->regs.edx);
-		get_user_ex(vm86regs.pt.si, &v86->regs.esi);
-		get_user_ex(vm86regs.pt.di, &v86->regs.edi);
-		get_user_ex(vm86regs.pt.bp, &v86->regs.ebp);
-		get_user_ex(vm86regs.pt.ax, &v86->regs.eax);
-		get_user_ex(vm86regs.pt.ip, &v86->regs.eip);
-		get_user_ex(seg, &v86->regs.cs);
+		get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
+		get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
+		get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
+		get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
+		get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
+		get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
+		get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
+		get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
+		get_user_ex(seg, &user_vm86->regs.cs);
 		vm86regs.pt.cs = seg;
-		get_user_ex(vm86regs.pt.flags, &v86->regs.eflags);
-		get_user_ex(vm86regs.pt.sp, &v86->regs.esp);
-		get_user_ex(seg, &v86->regs.ss);
+		get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
+		get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
+		get_user_ex(seg, &user_vm86->regs.ss);
 		vm86regs.pt.ss = seg;
-		get_user_ex(vm86regs.es, &v86->regs.es);
-		get_user_ex(vm86regs.ds, &v86->regs.ds);
-		get_user_ex(vm86regs.fs, &v86->regs.fs);
-		get_user_ex(vm86regs.gs, &v86->regs.gs);
-
-		get_user_ex(vm86->flags, &v86->flags);
-		get_user_ex(vm86->screen_bitmap, &v86->screen_bitmap);
-		get_user_ex(vm86->cpu_type, &v86->cpu_type);
+		get_user_ex(vm86regs.es, &user_vm86->regs.es);
+		get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
+		get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
+		get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
+
+		get_user_ex(vm86->flags, &user_vm86->flags);
+		get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
+		get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
 	} get_user_catch(err);
 	if (err)
 		return err;
 
-	if (copy_from_user(&vm86->int_revectored, &v86->int_revectored,
+	if (copy_from_user(&vm86->int_revectored,
+			   &user_vm86->int_revectored,
 			   sizeof(struct revectored_struct)))
 		return -EFAULT;
-	if (copy_from_user(&vm86->int21_revectored, &v86->int21_revectored,
+	if (copy_from_user(&vm86->int21_revectored,
+			   &user_vm86->int21_revectored,
 			   sizeof(struct revectored_struct)))
 		return -EFAULT;
 	if (plus) {
-		if (copy_from_user(&vm86->vm86plus, &v86->vm86plus,
+		if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
 				   sizeof(struct vm86plus_info_struct)))
 			return -EFAULT;
 		vm86->vm86plus.is_vm86pus = 1;
@@ -290,7 +292,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 		       sizeof(struct vm86plus_info_struct));
 
 	memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
-	vm86->vm86_info = v86;
+	vm86->user_vm86 = user_vm86;
 
 /*
  * The flags register is also special: we cannot trust that the user
-- 
cgit v1.2.3-70-g09d2


From decd275e62d5eef4b947fab89652fa6afdadf2f2 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:23 -0400
Subject: x86/vm86: Rename vm86->v86flags and v86mask

Rename v86flags to veflags, and v86mask to veflags_mask.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-9-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/vm86.h |  4 ++--
 arch/x86/kernel/vm86_32.c   | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index c93ae7379550..1e491f3af317 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -30,8 +30,8 @@ struct kernel_vm86_regs {
 struct vm86 {
 	struct vm86plus_struct __user *user_vm86;
 	struct pt_regs regs32;
-	unsigned long v86flags;
-	unsigned long v86mask;
+	unsigned long veflags;
+	unsigned long veflags_mask;
 	unsigned long saved_sp0;
 
 	unsigned long flags;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 52aa33e2baca..abd8b856bd2b 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -80,8 +80,8 @@
 /*
  * virtual flags (16 and 32-bit versions)
  */
-#define VFLAGS	(*(unsigned short *)&(current->thread.vm86->v86flags))
-#define VEFLAGS	(current->thread.vm86->v86flags)
+#define VFLAGS	(*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS	(current->thread.vm86->veflags)
 
 #define set_flags(X, new, mask) \
 ((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -108,7 +108,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 		pr_alert("no user_vm86: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask);
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
 	user = vm86->user_vm86;
 
 	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
@@ -308,16 +308,16 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 
 	switch (vm86->cpu_type) {
 	case CPU_286:
-		vm86->v86mask = 0;
+		vm86->veflags_mask = 0;
 		break;
 	case CPU_386:
-		vm86->v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	case CPU_486:
-		vm86->v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	default:
-		vm86->v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	}
 
@@ -377,7 +377,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
 
 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VEFLAGS, flags, current->thread.vm86->v86mask);
+	set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -387,7 +387,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
 
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VFLAGS, flags, current->thread.vm86->v86mask);
+	set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -402,7 +402,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
 	if (VEFLAGS & X86_EFLAGS_VIF)
 		flags |= X86_EFLAGS_IF;
 	flags |= X86_EFLAGS_IOPL;
-	return flags | (VEFLAGS & current->thread.vm86->v86mask);
+	return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
 }
 
 static inline int is_revectored(int nr, struct revectored_struct *bitmap)
-- 
cgit v1.2.3-70-g09d2


From 76b235c6bcb16062d663e2ee96db0b69f2e6bc14 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 24 Jul 2015 14:45:44 +0200
Subject: jump_label: Rename JUMP_LABEL_{EN,DIS}ABLE to JUMP_LABEL_{JMP,NOP}

Since we've already stepped away from ENABLE is a JMP and DISABLE is a
NOP with the branch_default bits, and are going to make it even worse,
rename it to make it all clearer.

This way we don't mix multiple levels of logic attributes, but have a
plain 'physical' name for what the current instruction patching status
of a jump label is.

This is a first step in removing the naming confusion that has led to
a stream of avoidable bugs such as:

  a833581e372a ("x86, perf: Fix static_key bug in load_mm_cr4()")

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
[ Beefed up the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/kernel/jump_label.c     |  2 +-
 arch/arm64/kernel/jump_label.c   |  2 +-
 arch/mips/kernel/jump_label.c    |  2 +-
 arch/powerpc/kernel/jump_label.c |  2 +-
 arch/s390/kernel/jump_label.c    |  2 +-
 arch/sparc/kernel/jump_label.c   |  2 +-
 arch/x86/kernel/jump_label.c     |  2 +-
 include/linux/jump_label.h       |  4 ++--
 kernel/jump_label.c              | 18 +++++++++---------
 9 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index e39cbf488cfe..845a5dd9c42b 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c
@@ -12,7 +12,7 @@ static void __arch_jump_label_transform(struct jump_entry *entry,
 	void *addr = (void *)entry->code;
 	unsigned int insn;
 
-	if (type == JUMP_LABEL_ENABLE)
+	if (type == JUMP_LABEL_JMP)
 		insn = arm_gen_branch(entry->code, entry->target);
 	else
 		insn = arm_gen_nop();
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index 4f1fec7a46db..c2dd1ad3e648 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -28,7 +28,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
 	void *addr = (void *)entry->code;
 	u32 insn;
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		insn = aarch64_insn_gen_branch_imm(entry->code,
 						   entry->target,
 						   AARCH64_INSN_BRANCH_NOLINK);
diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c
index dda800e9e731..3e586daa3a32 100644
--- a/arch/mips/kernel/jump_label.c
+++ b/arch/mips/kernel/jump_label.c
@@ -51,7 +51,7 @@ void arch_jump_label_transform(struct jump_entry *e,
 	/* Target must have the right alignment and ISA must be preserved. */
 	BUG_ON((e->target & J_ALIGN_MASK) != J_ISA_BIT);
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		insn.j_format.opcode = J_ISA_BIT ? mm_j32_op : j_op;
 		insn.j_format.target = e->target >> J_RANGE_SHIFT;
 	} else {
diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c
index a1ed8a8c7cb4..6472472093d0 100644
--- a/arch/powerpc/kernel/jump_label.c
+++ b/arch/powerpc/kernel/jump_label.c
@@ -17,7 +17,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
 {
 	u32 *addr = (u32 *)(unsigned long)entry->code;
 
-	if (type == JUMP_LABEL_ENABLE)
+	if (type == JUMP_LABEL_JMP)
 		patch_branch(addr, entry->target, 0);
 	else
 		patch_instruction(addr, PPC_INST_NOP);
diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index a90299600483..a83d2248fea9 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -64,7 +64,7 @@ static void __jump_label_transform(struct jump_entry *entry,
 {
 	struct insn old, new;
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		jump_label_make_nop(entry, &old);
 		jump_label_make_branch(entry, &new);
 	} else {
diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c
index 48565c11e82a..59bbeff55024 100644
--- a/arch/sparc/kernel/jump_label.c
+++ b/arch/sparc/kernel/jump_label.c
@@ -16,7 +16,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
 	u32 val;
 	u32 *insn = (u32 *) (unsigned long) entry->code;
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		s32 off = (s32)entry->target - (s32)entry->code;
 
 #ifdef CONFIG_SPARC64
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 26d5a55a2736..e565e0e4d216 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -45,7 +45,7 @@ static void __jump_label_transform(struct jump_entry *entry,
 	const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
 	const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		if (init) {
 			/*
 			 * Jump label is enabled for the first time.
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index f4de473f226b..6a8b4fe10ad8 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -86,8 +86,8 @@ struct static_key {
 #ifndef __ASSEMBLY__
 
 enum jump_label_type {
-	JUMP_LABEL_DISABLE = 0,
-	JUMP_LABEL_ENABLE,
+	JUMP_LABEL_NOP = 0,
+	JUMP_LABEL_JMP,
 };
 
 struct module;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 52ebaca1b9fc..96d8945c8bf3 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -65,9 +65,9 @@ void static_key_slow_inc(struct static_key *key)
 	jump_label_lock();
 	if (atomic_read(&key->enabled) == 0) {
 		if (!jump_label_get_branch_default(key))
-			jump_label_update(key, JUMP_LABEL_ENABLE);
+			jump_label_update(key, JUMP_LABEL_JMP);
 		else
-			jump_label_update(key, JUMP_LABEL_DISABLE);
+			jump_label_update(key, JUMP_LABEL_NOP);
 	}
 	atomic_inc(&key->enabled);
 	jump_label_unlock();
@@ -88,9 +88,9 @@ static void __static_key_slow_dec(struct static_key *key,
 		schedule_delayed_work(work, rate_limit);
 	} else {
 		if (!jump_label_get_branch_default(key))
-			jump_label_update(key, JUMP_LABEL_DISABLE);
+			jump_label_update(key, JUMP_LABEL_NOP);
 		else
-			jump_label_update(key, JUMP_LABEL_ENABLE);
+			jump_label_update(key, JUMP_LABEL_JMP);
 	}
 	jump_label_unlock();
 }
@@ -184,9 +184,9 @@ static enum jump_label_type jump_label_type(struct static_key *key)
 	bool state = static_key_enabled(key);
 
 	if ((!true_branch && state) || (true_branch && !state))
-		return JUMP_LABEL_ENABLE;
+		return JUMP_LABEL_JMP;
 
-	return JUMP_LABEL_DISABLE;
+	return JUMP_LABEL_NOP;
 }
 
 void __init jump_label_init(void)
@@ -276,7 +276,7 @@ void jump_label_apply_nops(struct module *mod)
 		return;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+		arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
 	}
 }
 
@@ -318,8 +318,8 @@ static int jump_label_add_module(struct module *mod)
 		jlm->next = key->next;
 		key->next = jlm;
 
-		if (jump_label_type(key) == JUMP_LABEL_ENABLE)
-			__jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
+		if (jump_label_type(key) == JUMP_LABEL_JMP)
+			__jump_label_update(key, iter, iter_stop, JUMP_LABEL_JMP);
 	}
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 3bbfafb77a06327fa1bc9f19bc55b5c558475091 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 24 Jul 2015 16:34:32 +0200
Subject: x86, tsc, locking/static_keys: Employ static_branch_likely()

Because of the static_key restrictions we had to take an unconditional
jump for the most likely case, causing $I bloat.

Rewrite to use the new primitives.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1bb8bab1b3cb..b9cfd462f7e7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -38,7 +38,7 @@ static int __read_mostly tsc_unstable;
    erroneous rdtsc usage on !cpu_has_tsc processors */
 static int __read_mostly tsc_disabled = -1;
 
-static struct static_key __use_tsc = STATIC_KEY_INIT;
+static DEFINE_STATIC_KEY_FALSE(__use_tsc);
 
 int tsc_clocksource_reliable;
 
@@ -274,7 +274,12 @@ done:
  */
 u64 native_sched_clock(void)
 {
-	u64 tsc_now;
+	if (static_branch_likely(&__use_tsc)) {
+		u64 tsc_now = rdtsc();
+
+		/* return the value in ns */
+		return cycles_2_ns(tsc_now);
+	}
 
 	/*
 	 * Fall back to jiffies if there's no TSC available:
@@ -284,16 +289,9 @@ u64 native_sched_clock(void)
 	 *   very important for it to be as fast as the platform
 	 *   can achieve it. )
 	 */
-	if (!static_key_false(&__use_tsc)) {
-		/* No locking but a rare wrong value is not a big deal: */
-		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
-	}
-
-	/* read the Time Stamp Counter: */
-	tsc_now = rdtsc();
 
-	/* return the value in ns */
-	return cycles_2_ns(tsc_now);
+	/* No locking but a rare wrong value is not a big deal: */
+	return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 }
 
 /* We need to define a real function for sched_clock, to override the
@@ -1204,7 +1202,7 @@ void __init tsc_init(void)
 	/* now allow native_sched_clock() to use rdtsc */
 
 	tsc_disabled = 0;
-	static_key_slow_inc(&__use_tsc);
+	static_branch_enable(&__use_tsc);
 
 	if (!no_sched_irq_time)
 		enable_sched_clock_irqtime();
-- 
cgit v1.2.3-70-g09d2


From 3a2a7797326a4bc59b7ff0cc92c8b274abf21892 Mon Sep 17 00:00:00 2001
From: Dasaratharaman Chandramouli <dasaratharaman.chandramouli@intel.com>
Date: Tue, 26 May 2015 11:47:39 -0700
Subject: perf/x86/intel/rapl: Add support for Knights Landing (KNL)

Knights Landing DRAM RAPL supports PKG and DRAM RAPL domains.
DRAM RAPL has a different fixed energy unit (2^-16J) similar to
that of HSW.

Signed-off-by: Dasaratharaman Chandramouli <dasaratharaman.chandramouli@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Stephane Eranian <eranian@google.com>
Acked-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jacob Pan Jun <jacob.jun.pan@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nikhil Rao <nikhil.rao@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/aa63b4a3af3160152fea1a10c807f4200527280c.1432665809.git.dasaratharaman.chandramouli@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 5cbd4e64feb5..81431c0f0614 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -86,6 +86,10 @@ static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
 			 1<<RAPL_IDX_RAM_NRG_STAT|\
 			 1<<RAPL_IDX_PP1_NRG_STAT)
 
+/* Knights Landing has PKG, RAM */
+#define RAPL_IDX_KNL	(1<<RAPL_IDX_PKG_NRG_STAT|\
+			 1<<RAPL_IDX_RAM_NRG_STAT)
+
 /*
  * event code: LSB 8 bits, passed in attr->config
  * any other bit is reserved
@@ -486,6 +490,18 @@ static struct attribute *rapl_events_hsw_attr[] = {
 	NULL,
 };
 
+static struct attribute *rapl_events_knl_attr[] = {
+	EVENT_PTR(rapl_pkg),
+	EVENT_PTR(rapl_ram),
+
+	EVENT_PTR(rapl_pkg_unit),
+	EVENT_PTR(rapl_ram_unit),
+
+	EVENT_PTR(rapl_pkg_scale),
+	EVENT_PTR(rapl_ram_scale),
+	NULL,
+};
+
 static struct attribute_group rapl_pmu_events_group = {
 	.name = "events",
 	.attrs = NULL, /* patched at runtime */
@@ -730,6 +746,10 @@ static int __init rapl_pmu_init(void)
 		rapl_cntr_mask = RAPL_IDX_SRV;
 		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
 		break;
+	case 87: /* Knights Landing */
+		rapl_add_quirk(rapl_hsw_server_quirk);
+		rapl_cntr_mask = RAPL_IDX_KNL;
+		rapl_pmu_events_group.attrs = rapl_events_knl_attr;
 
 	default:
 		/* unsupported */
-- 
cgit v1.2.3-70-g09d2


From 070a7cdfa4a0a799235d79e58e7b0b2d94dff190 Mon Sep 17 00:00:00 2001
From: Vaishali Thakkar <vthakkar1994@gmail.com>
Date: Fri, 17 Jul 2015 10:57:59 +0530
Subject: perf/x86/intel/uncore: Remove use of macro DEFINE_PCI_DEVICE_TABLE()

The DEFINE_PCI_DEVICE_TABLE() macro is deprecated. Use
'struct pci_device_id' instead of DEFINE_PCI_DEVICE_TABLE(),
with the goal of getting rid of this macro completely.

This Coccinelle semantic patch performs this transformation:

@@
identifier a;
declarer name DEFINE_PCI_DEVICE_TABLE;
initializer i;
@@
- DEFINE_PCI_DEVICE_TABLE(a)
+ const struct pci_device_id a[] = i;

Signed-off-by: Vaishali Thakkar <vthakkar1994@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150717052759.GA6265@vaishali-Ideapad-Z570
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 6d6e85dd5849..76a3feb15ab1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -2215,7 +2215,7 @@ static struct intel_uncore_type *hswep_pci_uncores[] = {
 	NULL,
 };
 
-static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = {
+static const struct pci_device_id hswep_uncore_pci_ids[] = {
 	{ /* Home Agent 0 */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
-- 
cgit v1.2.3-70-g09d2


From e3a13192d86048e91a2a1d534abe5ac2397d9113 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 14 Jun 2015 22:57:40 -0700
Subject: perf/x86/intel/uncore: Add support for ARB uncore PMU on
 Sandy/IvyBridge

Add a new "ARB" uncore PMU that is used to monitor the uncore queue
arbiter. This is useful to measure uncore queue occupancy and similar
statistics. The registers all have the same format as the
existing CBOX PMU.

Also move the event constraints from the CBOX to ARB. The 0x80+
events are ARB events and cannot be scheduled on a CBOX PMU.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1434347862-28490-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78c7012..f78574b3cb55 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -45,6 +45,11 @@
 #define SNB_UNC_CBO_0_PER_CTR0                  0x706
 #define SNB_UNC_CBO_MSR_OFFSET                  0x10
 
+/* SNB ARB register */
+#define SNB_UNC_ARB_PER_CTR0			0x3b0
+#define SNB_UNC_ARB_PERFEVTSEL0			0x3b2
+#define SNB_UNC_ARB_MSR_OFFSET			0x10
+
 /* NHM global control register */
 #define NHM_UNC_PERF_GLOBAL_CTL                 0x391
 #define NHM_UNC_FIXED_CTR                       0x394
@@ -115,7 +120,7 @@ static struct intel_uncore_ops snb_uncore_msr_ops = {
 	.read_counter	= uncore_msr_read_counter,
 };
 
-static struct event_constraint snb_uncore_cbox_constraints[] = {
+static struct event_constraint snb_uncore_arb_constraints[] = {
 	UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
 	UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
 	EVENT_CONSTRAINT_END
@@ -134,14 +139,28 @@ static struct intel_uncore_type snb_uncore_cbox = {
 	.single_fixed	= 1,
 	.event_mask	= SNB_UNC_RAW_EVENT_MASK,
 	.msr_offset	= SNB_UNC_CBO_MSR_OFFSET,
-	.constraints	= snb_uncore_cbox_constraints,
 	.ops		= &snb_uncore_msr_ops,
 	.format_group	= &snb_uncore_format_group,
 	.event_descs	= snb_uncore_events,
 };
 
+static struct intel_uncore_type snb_uncore_arb = {
+	.name		= "arb",
+	.num_counters   = 2,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 44,
+	.perf_ctr	= SNB_UNC_ARB_PER_CTR0,
+	.event_ctl	= SNB_UNC_ARB_PERFEVTSEL0,
+	.event_mask	= SNB_UNC_RAW_EVENT_MASK,
+	.msr_offset	= SNB_UNC_ARB_MSR_OFFSET,
+	.constraints	= snb_uncore_arb_constraints,
+	.ops		= &snb_uncore_msr_ops,
+	.format_group	= &snb_uncore_format_group,
+};
+
 static struct intel_uncore_type *snb_msr_uncores[] = {
 	&snb_uncore_cbox,
+	&snb_uncore_arb,
 	NULL,
 };
 
-- 
cgit v1.2.3-70-g09d2


From 3a999587b4a1815cf4dadddf6b5aad470f048239 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 14 Jun 2015 22:57:41 -0700
Subject: perf/x86/intel/uncore: Use Sandy Bridge client PMU on
 Haswell/Broadwell

Haswell and Broadwell have the same uncore CBOX/ARB PMU as Sandy Bridge.
Add the respective model numbers to enable the SNB uncore PMU.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1434347862-28490-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38c921b..c2af96716d62 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -1209,6 +1209,11 @@ static int __init uncore_cpu_init(void)
 		break;
 	case 42: /* Sandy Bridge */
 	case 58: /* Ivy Bridge */
+	case 60: /* Haswell */
+	case 69: /* Haswell */
+	case 70: /* Haswell */
+	case 61: /* Broadwell */
+	case 71: /* Broadwell */
 		snb_uncore_cpu_init();
 		break;
 	case 45: /* Sandy Bridge-EP */
-- 
cgit v1.2.3-70-g09d2


From e9b3bd379c283577e102529bfb22484238be7c91 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 24 Jun 2015 13:05:48 +0300
Subject: perf/x86/intel/bts: Drop redundant declarations

Both intel_pmu_enable_bts() and intel_pmu_disable_bts() are in perf_event.h
header file, no need to have them declared again in the driver.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1435140349-32588-2-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_bts.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index 43dd672d788b..54690e885759 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -62,9 +62,6 @@ struct bts_buffer {
 
 struct pmu bts_pmu;
 
-void intel_pmu_enable_bts(u64 config);
-void intel_pmu_disable_bts(void);
-
 static size_t buf_size(struct page *page)
 {
 	return 1 << (PAGE_SHIFT + page_private(page));
-- 
cgit v1.2.3-70-g09d2


From c749b3e96398fcc39286267b72fb8b85c0f757ea Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 24 Jun 2015 13:05:49 +0300
Subject: perf/x86/intel/lbr: Kill off intel_pmu_needs_lbr_smpl for good

The x86_lbr_exclusive commit (4807034248be "perf/x86: Mark Intel PT and
LBR/BTS as mutually exclusive") mistakenly moved intel_pmu_needs_lbr_smpl()
to perf_event.h, while another commit (a46a2300019 "perf: Simplify the
branch stack check") removed it in favor of needs_branch_stack().

This patch gets rid of intel_pmu_needs_lbr_smpl() for good.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1435140349-32588-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3e7fd27dfe20..3474cf26b6c4 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -808,20 +808,6 @@ static inline int amd_pmu_init(void)
 
 #ifdef CONFIG_CPU_SUP_INTEL
 
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
-	/* user explicitly requested branch sampling */
-	if (has_branch_stack(event))
-		return true;
-
-	/* implicit branch sampling to correct PEBS skid */
-	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
-	    x86_pmu.intel_cap.pebs_format < 2)
-		return true;
-
-	return false;
-}
-
 static inline bool intel_pmu_has_bts(struct perf_event *event)
 {
 	if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
-- 
cgit v1.2.3-70-g09d2


From ae3f011fc25104a218caf4448b1d47ef1c9b3a42 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Wed, 24 Jun 2015 11:23:35 -0700
Subject: perf/x86/intel: Fix SLM MSR_OFFCORE_RSP1 valid_mask

AVG_LATENCY(bit 38) is only available on MSR_OFFCORE_RSP0.
So the bit should be removed from RSP1 valid_mask.

Since RSP0 and RSP1 may have different valid_mask, intel_alt_er should
validate the config on the alternate offcore reg before replacing it.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1435170215-5017-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index b9826a981fb2..71815cf3d2d3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1114,7 +1114,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly =
 {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
 	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
-	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1),
+	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
 	EVENT_EXTRA_END
 };
 
@@ -1699,18 +1699,22 @@ intel_bts_constraints(struct perf_event *event)
 	return NULL;
 }
 
-static int intel_alt_er(int idx)
+static int intel_alt_er(int idx, u64 config)
 {
+	int alt_idx;
 	if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
 		return idx;
 
 	if (idx == EXTRA_REG_RSP_0)
-		return EXTRA_REG_RSP_1;
+		alt_idx = EXTRA_REG_RSP_1;
 
 	if (idx == EXTRA_REG_RSP_1)
-		return EXTRA_REG_RSP_0;
+		alt_idx = EXTRA_REG_RSP_0;
 
-	return idx;
+	if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+		return idx;
+
+	return alt_idx;
 }
 
 static void intel_fixup_er(struct perf_event *event, int idx)
@@ -1799,7 +1803,7 @@ again:
 		 */
 		c = NULL;
 	} else {
-		idx = intel_alt_er(idx);
+		idx = intel_alt_er(idx, reg->config);
 		if (idx != reg->idx) {
 			raw_spin_unlock_irqrestore(&era->lock, flags);
 			goto again;
-- 
cgit v1.2.3-70-g09d2


From e5779e8e12299f77c2421a707855d8d124171d85 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 30 Jul 2015 20:32:40 -0700
Subject: perf/x86/hw_breakpoints: Disallow kernel breakpoints unless
 kprobe-safe

Code on the kprobe blacklist doesn't want unexpected int3
exceptions. It probably doesn't want unexpected debug exceptions
either. Be safe: disallow breakpoints in nokprobes code.

On non-CONFIG_KPROBES kernels, there is no kprobe blacklist.  In
that case, disallow kernel breakpoints entirely.

It will be particularly important to keep hw breakpoints out of the
entry and NMI code once we move debug exceptions off the IST stack.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/e14b152af99640448d895e3c2a8c2d5ee19a1325.1438312874.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/hw_breakpoint.c | 15 +++++++++++++++
 include/linux/kprobes.h         |  2 ++
 kernel/kprobes.c                |  2 +-
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 7114ba220fd4..78f3e90c5659 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,6 +32,7 @@
 #include <linux/irqflags.h>
 #include <linux/notifier.h>
 #include <linux/kallsyms.h>
+#include <linux/kprobes.h>
 #include <linux/percpu.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
@@ -243,6 +244,20 @@ static int arch_build_bp_info(struct perf_event *bp)
 		info->type = X86_BREAKPOINT_RW;
 		break;
 	case HW_BREAKPOINT_X:
+		/*
+		 * We don't allow kernel breakpoints in places that are not
+		 * acceptable for kprobes.  On non-kprobes kernels, we don't
+		 * allow kernel breakpoints at all.
+		 */
+		if (bp->attr.bp_addr >= TASK_SIZE_MAX) {
+#ifdef CONFIG_KPROBES
+			if (within_kprobe_blacklist(bp->attr.bp_addr))
+				return -EINVAL;
+#else
+			return -EINVAL;
+#endif
+		}
+
 		info->type = X86_BREAKPOINT_EXECUTE;
 		/*
 		 * x86 inst breakpoints need to have a specific undefined len.
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 1ab54754a86d..8f6849084248 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -267,6 +267,8 @@ extern void show_registers(struct pt_regs *regs);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
 
+extern bool within_kprobe_blacklist(unsigned long addr);
+
 struct kprobe_insn_cache {
 	struct mutex mutex;
 	void *(*alloc)(void);	/* allocate insn page */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c90e417bb963..d10ab6b9b5e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
 	       addr < (unsigned long)__kprobes_text_end;
 }
 
-static bool within_kprobe_blacklist(unsigned long addr)
+bool within_kprobe_blacklist(unsigned long addr)
 {
 	struct kprobe_blacklist_entry *ent;
 
-- 
cgit v1.2.3-70-g09d2


From ab513927ab449af00cc70b0269e15ee80dd537f9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 30 Jul 2015 20:32:41 -0700
Subject: perf/x86/hw_breakpoints: Improve range breakpoint validation

Range breakpoints will do the wrong thing if the address isn't
aligned.  While we're there, add comments about why it's safe for
instruction breakpoints.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/ae25d14d61f2f43b78e0a247e469f3072df7e201.1438312874.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/hw_breakpoint.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 78f3e90c5659..6f345d302cf6 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -291,8 +291,18 @@ static int arch_build_bp_info(struct perf_event *bp)
 		break;
 #endif
 	default:
+		/* AMD range breakpoint */
 		if (!is_power_of_2(bp->attr.bp_len))
 			return -EINVAL;
+		if (bp->attr.bp_addr & (bp->attr.bp_len - 1))
+			return -EINVAL;
+		/*
+		 * It's impossible to use a range breakpoint to fake out
+		 * user vs kernel detection because bp_len - 1 can't
+		 * have the high bit set.  If we ever allow range instruction
+		 * breakpoints, then we'll have to check for kprobe-blacklisted
+		 * addresses anywhere in the range.
+		 */
 		if (!cpu_has_bpext)
 			return -EOPNOTSUPP;
 		info->mask = bp->attr.bp_len - 1;
-- 
cgit v1.2.3-70-g09d2


From 27747f8bc355a2808ca9e490ab6866acd85b4c16 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 30 Jul 2015 20:32:42 -0700
Subject: perf/x86/hw_breakpoints: Fix check for kernel-space breakpoints

The check looked wrong, although I think it was actually safe.  TASK_SIZE
is unnecessarily small for compat tasks, and it wasn't possible to make
a range breakpoint so large it started in user space and ended in kernel
space.

Nonetheless, let's fix up the check for the benefit of future
readers.  A breakpoint is in the kernel if either end is in the
kernel.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/136be387950e78f18cea60e9d1bef74465d0ee8f.1438312874.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/hw_breakpoint.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 6f345d302cf6..50a3fad5b89f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -180,7 +180,11 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
 	va = info->address;
 	len = bp->attr.bp_len;
 
-	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+	/*
+	 * We don't need to worry about va + len - 1 overflowing:
+	 * we already require that va is aligned to a multiple of len.
+	 */
+	return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
 }
 
 int arch_bp_generic_fields(int x86_len, int x86_type,
-- 
cgit v1.2.3-70-g09d2


From 9a6694cfa2390181dec936a17c0d9d21ef7b08d9 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Thu, 30 Jul 2015 16:48:24 +0300
Subject: perf/x86/intel/pt: Do not force sync packets on every schedule-in

Currently, the PT driver zeroes out the status register every time before
starting the event. However, all the writable bits are already taken care
of in pt_handle_status() function, except the new PacketByteCnt field,
which in new versions of PT contains the number of packet bytes written
since the last sync (PSB) packet. Zeroing it out before enabling PT forces
a sync packet to be written. This means that, with the existing code, a
sync packet (PSB and PSBEND, 18 bytes in total) will be generated every
time a PT event is scheduled in.

To avoid these unnecessary syncs and save a WRMSR in the fast path, this
patch changes the default behavior to not clear PacketByteCnt field, so
that the sync packets will be generated with the period specified as
"psb_period" attribute config field. This has little impact on the trace
data as the other packets that are normally sent within PSB+ (between PSB
and PSBEND) have their own generation scenarios which do not depend on the
sync packets.

One exception where we do need to force PSB like this when tracing starts,
so that the decoder has a clear sync point in the trace. For this purpose
we aready have hw::itrace_started flag, which we are currently using to
output PERF_RECORD_ITRACE_START. This patch moves setting itrace_started
from perf core to the pmu::start, where it should still be 0 on the very
first run.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1438264104-16189-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_pt.c | 7 +++++--
 kernel/events/core.c                      | 2 --
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index 183de719628d..cc58ef8d9ada 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -191,6 +191,11 @@ static void pt_config(struct perf_event *event)
 {
 	u64 reg;
 
+	if (!event->hw.itrace_started) {
+		event->hw.itrace_started = 1;
+		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+	}
+
 	reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
 
 	if (!event->attr.exclude_kernel)
@@ -910,7 +915,6 @@ void intel_pt_interrupt(void)
 
 		pt_config_buffer(buf->cur->table, buf->cur_idx,
 				 buf->output_off);
-		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
 		pt_config(event);
 	}
 }
@@ -934,7 +938,6 @@ static void pt_event_start(struct perf_event *event, int mode)
 
 	pt_config_buffer(buf->cur->table, buf->cur_idx,
 			 buf->output_off);
-	wrmsrl(MSR_IA32_RTIT_STATUS, 0);
 	pt_config(event);
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a9796c8ff7e0..bdea12924b11 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6139,8 +6139,6 @@ static void perf_log_itrace_start(struct perf_event *event)
 	    event->hw.itrace_started)
 		return;
 
-	event->hw.itrace_started = 1;
-
 	rec.header.type	= PERF_RECORD_ITRACE_START;
 	rec.header.misc	= 0;
 	rec.header.size	= sizeof(rec);
-- 
cgit v1.2.3-70-g09d2


From b1bf72d6691cc33fc7763fc8ec77df42ca1a8702 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Thu, 30 Jul 2015 16:15:31 +0300
Subject: perf/x86/intel/pt: Add new timing packet enables

Intel PT chapter in the new Intel Architecture SDM adds several packets
corresponding enable bits and registers that control packet generation.
Also, additional bits in the Intel PT CPUID leaf were added to enumerate
presence and parameters of these new packets and features.

The packets and enables are:

  * CYC: cycle accurate mode, provides the number of cycles elapsed since
    previous CYC packet; its presence and available threshold values are
    enumerated via CPUID;

  * MTC: mini time counter packets, used for tracking TSC time between
    full TSC packets; its presence and available resolution options are
    enumerated via CPUID;

  * PSB packet period is now configurable, available period values are
    enumerated via CPUID.

This patch adds corresponding bit and register definitions, pmu driver
capabilities based on CPUID enumeration, new attribute format bits for
the new featurens and extends event configuration validation function
to take these into account.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1438262131-12725-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/msr-index.h          |  8 ++++
 arch/x86/kernel/cpu/intel_pt.h            |  6 +++
 arch/x86/kernel/cpu/perf_event_intel_pt.c | 68 ++++++++++++++++++++++++++++++-
 3 files changed, 81 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 9ebc3d009373..c665d34f7285 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -80,13 +80,21 @@
 
 #define MSR_IA32_RTIT_CTL		0x00000570
 #define RTIT_CTL_TRACEEN		BIT(0)
+#define RTIT_CTL_CYCLEACC		BIT(1)
 #define RTIT_CTL_OS			BIT(2)
 #define RTIT_CTL_USR			BIT(3)
 #define RTIT_CTL_CR3EN			BIT(7)
 #define RTIT_CTL_TOPA			BIT(8)
+#define RTIT_CTL_MTC_EN			BIT(9)
 #define RTIT_CTL_TSC_EN			BIT(10)
 #define RTIT_CTL_DISRETC		BIT(11)
 #define RTIT_CTL_BRANCH_EN		BIT(13)
+#define RTIT_CTL_MTC_RANGE_OFFSET	14
+#define RTIT_CTL_MTC_RANGE		(0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
+#define RTIT_CTL_CYC_THRESH_OFFSET	19
+#define RTIT_CTL_CYC_THRESH		(0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
+#define RTIT_CTL_PSB_FREQ_OFFSET	24
+#define RTIT_CTL_PSB_FREQ      		(0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
 #define MSR_IA32_RTIT_STATUS		0x00000571
 #define RTIT_STATUS_CONTEXTEN		BIT(1)
 #define RTIT_STATUS_TRIGGEREN		BIT(2)
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
index 1c338b0eba05..feb293e96531 100644
--- a/arch/x86/kernel/cpu/intel_pt.h
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -72,9 +72,15 @@ struct topa_entry {
 enum pt_capabilities {
 	PT_CAP_max_subleaf = 0,
 	PT_CAP_cr3_filtering,
+	PT_CAP_psb_cyc,
+	PT_CAP_mtc,
 	PT_CAP_topa_output,
 	PT_CAP_topa_multiple_entries,
+	PT_CAP_single_range_output,
 	PT_CAP_payloads_lip,
+	PT_CAP_mtc_periods,
+	PT_CAP_cycle_thresholds,
+	PT_CAP_psb_periods,
 };
 
 struct pt_pmu {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index cc58ef8d9ada..e20cfacb5a32 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -65,9 +65,15 @@ static struct pt_cap_desc {
 } pt_caps[] = {
 	PT_CAP(max_subleaf,		0, CR_EAX, 0xffffffff),
 	PT_CAP(cr3_filtering,		0, CR_EBX, BIT(0)),
+	PT_CAP(psb_cyc,			0, CR_EBX, BIT(1)),
+	PT_CAP(mtc,			0, CR_EBX, BIT(3)),
 	PT_CAP(topa_output,		0, CR_ECX, BIT(0)),
 	PT_CAP(topa_multiple_entries,	0, CR_ECX, BIT(1)),
+	PT_CAP(single_range_output,	0, CR_ECX, BIT(2)),
 	PT_CAP(payloads_lip,		0, CR_ECX, BIT(31)),
+	PT_CAP(mtc_periods,		1, CR_EAX, 0xffff0000),
+	PT_CAP(cycle_thresholds,	1, CR_EBX, 0xffff),
+	PT_CAP(psb_periods,		1, CR_EBX, 0xffff0000),
 };
 
 static u32 pt_cap_get(enum pt_capabilities cap)
@@ -94,12 +100,22 @@ static struct attribute_group pt_cap_group = {
 	.name	= "caps",
 };
 
+PMU_FORMAT_ATTR(cyc,		"config:1"	);
+PMU_FORMAT_ATTR(mtc,		"config:9"	);
 PMU_FORMAT_ATTR(tsc,		"config:10"	);
 PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
+PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	);
+PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	);
+PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);
 
 static struct attribute *pt_formats_attr[] = {
+	&format_attr_cyc.attr,
+	&format_attr_mtc.attr,
 	&format_attr_tsc.attr,
 	&format_attr_noretcomp.attr,
+	&format_attr_mtc_period.attr,
+	&format_attr_cyc_thresh.attr,
+	&format_attr_psb_period.attr,
 	NULL,
 };
 
@@ -170,15 +186,65 @@ fail:
 	return ret;
 }
 
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC	| \
+			  RTIT_CTL_CYC_THRESH	| \
+			  RTIT_CTL_PSB_FREQ)
+
+#define RTIT_CTL_MTC	(RTIT_CTL_MTC_EN	| \
+			 RTIT_CTL_MTC_RANGE)
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN		| \
+			RTIT_CTL_DISRETC	| \
+			RTIT_CTL_CYC_PSB	| \
+			RTIT_CTL_MTC)
 
 static bool pt_event_valid(struct perf_event *event)
 {
 	u64 config = event->attr.config;
+	u64 allowed, requested;
 
 	if ((config & PT_CONFIG_MASK) != config)
 		return false;
 
+	if (config & RTIT_CTL_CYC_PSB) {
+		if (!pt_cap_get(PT_CAP_psb_cyc))
+			return false;
+
+		allowed = pt_cap_get(PT_CAP_psb_periods);
+		requested = (config & RTIT_CTL_PSB_FREQ) >>
+			RTIT_CTL_PSB_FREQ_OFFSET;
+		if (requested && (!(allowed & BIT(requested))))
+			return false;
+
+		allowed = pt_cap_get(PT_CAP_cycle_thresholds);
+		requested = (config & RTIT_CTL_CYC_THRESH) >>
+			RTIT_CTL_CYC_THRESH_OFFSET;
+		if (requested && (!(allowed & BIT(requested))))
+			return false;
+	}
+
+	if (config & RTIT_CTL_MTC) {
+		/*
+		 * In the unlikely case that CPUID lists valid mtc periods,
+		 * but not the mtc capability, drop out here.
+		 *
+		 * Spec says that setting mtc period bits while mtc bit in
+		 * CPUID is 0 will #GP, so better safe than sorry.
+		 */
+		if (!pt_cap_get(PT_CAP_mtc))
+			return false;
+
+		allowed = pt_cap_get(PT_CAP_mtc_periods);
+		if (!allowed)
+			return false;
+
+		requested = (config & RTIT_CTL_MTC_RANGE) >>
+			RTIT_CTL_MTC_RANGE_OFFSET;
+
+		if (!(allowed & BIT(requested)))
+			return false;
+	}
+
 	return true;
 }
 
-- 
cgit v1.2.3-70-g09d2


From a94cab2376cb35f236be14e2833cef63a8762a31 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 10 May 2015 12:22:39 -0700
Subject: perf/x86: Add a native_perf_sched_clock_from_tsc()

PEBSv3 has a raw TSC time stamp in its memory buffer that
later needs to to be converted to perf_clock.

Add a native_sched_clock_from_tsc() that works the same
as native_sched_clock(), but starts with an already given
TSC value.

Paravirt is ignored, it will just get the native clock.
But there isn't a para virtualized PEBS anyway.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285767-27027-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tsc.h | 1 +
 arch/x86/kernel/tsc.c      | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94605c0e9cee..aad56eb3bbe2 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,7 @@ extern int unsynchronized_tsc(void);
 extern int check_tsc_unstable(void);
 extern int check_tsc_disabled(void);
 extern unsigned long native_calibrate_tsc(void);
+extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
 
 extern int tsc_clocksource_reliable;
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7437b41f6a47..88e9a38c71a5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -296,6 +296,14 @@ u64 native_sched_clock(void)
 	return cycles_2_ns(tsc_now);
 }
 
+/*
+ * Generate a sched_clock if you already have a TSC value.
+ */
+u64 native_sched_clock_from_tsc(u64 tsc)
+{
+	return cycles_2_ns(tsc);
+}
+
 /* We need to define a real function for sched_clock, to override the
    weak default version */
 #ifdef CONFIG_PARAVIRT
-- 
cgit v1.2.3-70-g09d2


From 2f7ebf2ec2a7b311318aae10b8373b0bd93001a7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 10 May 2015 12:22:40 -0700
Subject: perf/x86/intel: Add support for PEBSv3 profiling

PEBSv3 is the same as the existing PEBSv2 used on Haswell,
but it adds a new TSC field. Add support to the generic
PEBS handler to handle the new format, and overwrite
the perf time stamp using the new native_sched_clock_from_tsc().

Right now the time stamp is just slightly more accurate,
as it is nearer the actual event trigger point. With
the PEBS threshold > 1 patchkit it will be much more accurate,
avoid the problems with MMAP mismatches earlier.
The accurate time stamping is only implemented for
the default trace clock for now.

v2: Use _skl prefix. Check for default clock_id.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285767-27027-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 36 ++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 71fc40238843..410270a9078f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -224,6 +224,19 @@ union hsw_tsx_tuning {
 
 #define PEBS_HSW_TSX_FLAGS	0xff00000000ULL
 
+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+	u64 flags, ip;
+	u64 ax, bx, cx, dx;
+	u64 si, di, bp, sp;
+	u64 r8,  r9,  r10, r11;
+	u64 r12, r13, r14, r15;
+	u64 status, dla, dse, lat;
+	u64 real_ip, tsx_tuning;
+	u64 tsc;
+};
+
 void init_debug_store_on_cpu(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -885,7 +898,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	return 0;
 }
 
-static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
 {
 	if (pebs->tsx_tuning) {
 		union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
@@ -894,7 +907,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
 	return 0;
 }
 
-static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
 {
 	u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
 
@@ -918,7 +931,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	 * unconditionally access the 'extra' entries.
 	 */
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	struct pebs_record_hsw *pebs = __pebs;
+	struct pebs_record_skl *pebs = __pebs;
 	u64 sample_type;
 	int fll, fst, dsrc;
 	int fl = event->hw.flags;
@@ -1016,6 +1029,16 @@ static void setup_pebs_sample_data(struct perf_event *event,
 			data->txn = intel_hsw_transaction(pebs);
 	}
 
+	/*
+	 * v3 supplies an accurate time stamp, so we use that
+	 * for the time stamp.
+	 *
+	 * We can only do this for the default trace clock.
+	 */
+	if (x86_pmu.intel_cap.pebs_format >= 3 &&
+		event->attr.use_clockid == 0)
+		data->time = native_sched_clock_from_tsc(pebs->tsc);
+
 	if (has_branch_stack(event))
 		data->br_stack = &cpuc->lbr_stack;
 }
@@ -1245,6 +1268,13 @@ void __init intel_ds_init(void)
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
 			break;
 
+		case 3:
+			pr_cont("PEBS fmt3%c, ", pebs_type);
+			x86_pmu.pebs_record_size =
+						sizeof(struct pebs_record_skl);
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+			break;
+
 		default:
 			printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
 			x86_pmu.pebs = 0;
-- 
cgit v1.2.3-70-g09d2


From a7b58d211ba18c9175b139e18b68c86a6bcc3c3f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 27 May 2015 21:13:14 -0700
Subject: perf/x86/intel/lbr: Allow time stamp for free running PEBSv3

With PEBSv3 the PEBS record contains a time stamp. That means we can allow
free-running PEBS without a PMI even if the user program requested a time stamp.
This avoids the need to use -T to get free running PEBS, and also avoids
any problems with mis-identifying MMAPs later.

Move the free_running_flags state into a variable in x86_pmu and use it.
This only works when no explicit clock_id is set.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@kernel.org
Cc: eranian@google.com
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1432786398-23861-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h          |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c    | 15 ++++++++++++++-
 arch/x86/kernel/cpu/perf_event_intel_ds.c |  1 +
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3474cf26b6c4..7378b1054e81 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -594,6 +594,7 @@ struct x86_pmu {
 	struct event_constraint *pebs_constraints;
 	void		(*pebs_aliases)(struct perf_event *event);
 	int 		max_pebs_events;
+	unsigned long	free_running_flags;
 
 	/*
 	 * Intel LBR
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 71815cf3d2d3..cb112bffcf70 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2257,6 +2257,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
 	}
 }
 
+static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+{
+	unsigned long flags = x86_pmu.free_running_flags;
+
+	if (event->attr.use_clockid)
+		flags &= ~PERF_SAMPLE_TIME;
+	return flags;
+}
+
 static int intel_pmu_hw_config(struct perf_event *event)
 {
 	int ret = x86_pmu_hw_config(event);
@@ -2267,7 +2276,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (event->attr.precise_ip) {
 		if (!event->attr.freq) {
 			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
-			if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
+			if (!(event->attr.sample_type &
+			      ~intel_pmu_free_running_flags(event)))
 				event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
 		}
 		if (x86_pmu.pebs_aliases)
@@ -2689,6 +2699,8 @@ static __initconst const struct x86_pmu core_pmu = {
 	.event_map		= intel_pmu_event_map,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
+	.free_running_flags	= PEBS_FREERUNNING_FLAGS,
+
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32-bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -2727,6 +2739,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.event_map		= intel_pmu_event_map,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
+	.free_running_flags	= PEBS_FREERUNNING_FLAGS,
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 410270a9078f..03773c230fb0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -1273,6 +1273,7 @@ void __init intel_ds_init(void)
 			x86_pmu.pebs_record_size =
 						sizeof(struct pebs_record_skl);
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+			x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
 			break;
 
 		default:
-- 
cgit v1.2.3-70-g09d2


From 50eab8f6ecd77ae4f9742f8e21ea50705ce0f830 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 10 May 2015 12:22:43 -0700
Subject: perf/x86/intel/lbr: Add support for LBRv5

Add support for the new LBRv5 format used on Intel Skylake CPUs.

The flags for mispredict, abort, in_tx etc. moved to range of separate
LBR_INFO_* MSRs. Teach the LBR code to read those. The original
LBR registers stay the same, except they have full sign
extension now.

LBR_INFO also reports a cycle count to the last branch.
Report the cycle information using the new "cycles" branch_info
output field.

In addition we have to context switch and clear the new INFO
MSRs to avoid any information leaks.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285767-27027-6-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           |  1 +
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 21 ++++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7378b1054e81..e9c5bdf5d94e 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -625,6 +625,7 @@ struct x86_pmu {
 struct x86_perf_task_context {
 	u64 lbr_from[MAX_LBR_ENTRIES];
 	u64 lbr_to[MAX_LBR_ENTRIES];
+	u64 lbr_info[MAX_LBR_ENTRIES];
 	int lbr_callstack_users;
 	int lbr_stack_state;
 };
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 452a7bd2dedb..2fb57373beca 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -13,7 +13,8 @@ enum {
 	LBR_FORMAT_EIP		= 0x02,
 	LBR_FORMAT_EIP_FLAGS	= 0x03,
 	LBR_FORMAT_EIP_FLAGS2	= 0x04,
-	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_EIP_FLAGS2,
+	LBR_FORMAT_INFO		= 0x05,
+	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,
 };
 
 static enum {
@@ -186,6 +187,8 @@ static void intel_pmu_lbr_reset_64(void)
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		wrmsrl(x86_pmu.lbr_from + i, 0);
 		wrmsrl(x86_pmu.lbr_to   + i, 0);
+		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+			wrmsrl(MSR_LBR_INFO_0 + i, 0);
 	}
 }
 
@@ -234,6 +237,8 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 		lbr_idx = (tos - i) & mask;
 		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
 		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+			wrmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]);
 	}
 	task_ctx->lbr_stack_state = LBR_NONE;
 }
@@ -255,6 +260,8 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 		lbr_idx = (tos - i) & mask;
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
 		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+			rdmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]);
 	}
 	task_ctx->lbr_stack_state = LBR_VALID;
 }
@@ -416,11 +423,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		unsigned long lbr_idx = (tos - i) & mask;
 		u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
 		int skip = 0;
+		u16 cycles = 0;
 		int lbr_flags = lbr_desc[lbr_format];
 
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
 		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
 
+		if (lbr_format == LBR_FORMAT_INFO) {
+			u64 info;
+
+			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+			mis = !!(info & LBR_INFO_MISPRED);
+			pred = !mis;
+			in_tx = !!(info & LBR_INFO_IN_TX);
+			abort = !!(info & LBR_INFO_ABORT);
+			cycles = (info & LBR_INFO_CYCLES);
+		}
 		if (lbr_flags & LBR_EIP_FLAGS) {
 			mis = !!(from & LBR_FROM_FLAG_MISPRED);
 			pred = !mis;
@@ -450,6 +468,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		cpuc->lbr_entries[out].predicted = pred;
 		cpuc->lbr_entries[out].in_tx	 = in_tx;
 		cpuc->lbr_entries[out].abort	 = abort;
+		cpuc->lbr_entries[out].cycles	 = cycles;
 		cpuc->lbr_entries[out].reserved	 = 0;
 		out++;
 	}
-- 
cgit v1.2.3-70-g09d2


From d8020bee1d0caa90e7b9d6f39ac1fdfaaee7f67f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 10 May 2015 12:22:45 -0700
Subject: perf/x86/intel: Handle new arch perfmon v4 status bits

ArchPerfmon v4 has some new status bits in GLOBAL_STATUS.

These need to be ignored when deciding whether a NMI
was an NMI, to avoid eating all NMIs when they
stay set, see:

    b292d7a10487 ("perf/x86/intel: ignore CondChgd bit to avoid false NMI handling")

This patch ignores the new ASIF bit, which indicates
that SGX interfered with the PMU, and also the new
LBR freezing bits, which are set when the LBRs get
frozen, plus the existing CondChange (set by JTAG
debuggers and some buggy BIOSes)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285767-27027-8-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index cb112bffcf70..52c9ded70f6c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1611,13 +1611,14 @@ again:
 	intel_pmu_lbr_read();
 
 	/*
-	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
-	 * and clear the bit.
+	 * Ignore a range of extra bits in status that do not indicate
+	 * overflow by themselves.
 	 */
-	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
-		if (!status)
-			goto done;
-	}
+	status &= ~(GLOBAL_STATUS_COND_CHG |
+		    GLOBAL_STATUS_ASIF |
+		    GLOBAL_STATUS_LBRS_FROZEN);
+	if (!status)
+		goto done;
 
 	/*
 	 * PEBS overflow sets bit 62 in the global status register
-- 
cgit v1.2.3-70-g09d2


From 0f29e573dd32bb8598e74271454e97c962da5e05 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 10 May 2015 12:22:47 -0700
Subject: perf/x86/intel: Move PMU ACK to after LBR read

With Arch Perfmon v4 the PMU ack unfreezes the LBRs. So we need to do
the PMU ack after the LBR reading, otherwise the LBRs would be polluted
by the PMI handler.

This is a minimal change. In principle the ACK could be moved much later.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285767-27027-10-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 52c9ded70f6c..da93b4bde963 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1594,6 +1594,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 
 	loops = 0;
 again:
+	intel_pmu_lbr_read();
 	intel_pmu_ack_status(status);
 	if (++loops > 100) {
 		static bool warned = false;
@@ -1608,7 +1609,6 @@ again:
 
 	inc_irq_stat(apic_perf_irqs);
 
-	intel_pmu_lbr_read();
 
 	/*
 	 * Ignore a range of extra bits in status that do not indicate
-- 
cgit v1.2.3-70-g09d2


From 425507fa5f321bb5ce1b5eb57a9586e0cf0b9802 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 10 May 2015 12:22:46 -0700
Subject: perf/x86/intel/lbr: Optimize v4 LBR unfreezing

In Arch perfmon v4 the GLOBAL_STATUS reset automatically unfreezes
LBRs. So no need to do it manually in the LBR code. Add a check
to skip it.

v2: Move test up to beginning of function.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285767-27027-9-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 2fb57373beca..769a42f874dc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -140,6 +140,13 @@ static void __intel_pmu_lbr_enable(bool pmi)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	u64 debugctl, lbr_select = 0, orig_debugctl;
 
+	/*
+	 * No need to unfreeze manually, as v4 can do that as part
+	 * of the GLOBAL_STATUS ack.
+	 */
+	if (pmi && x86_pmu.version >= 4)
+		return;
+
 	/*
 	 * No need to reprogram LBR_SELECT in a PMI, as it
 	 * did not change.
-- 
cgit v1.2.3-70-g09d2


From 9a92e16fd7b4ccd9aabcbc4d42a3fb5f9a3cf4a1 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 10 May 2015 12:22:44 -0700
Subject: perf/x86/intel: Add Intel Skylake PMU support

Add perf core PMU support for future Intel Skylake CPU cores.

The code is based on Haswell/Broadwell.

There is a new cache event list, based on the updated Haswell
event list.

Skylake has removed most counter constraints on basic
events, so the basic constraints table now only has a single
entry (plus the fixed counters).

TSX support and various other setups are all shared with Haswell.

Skylake has 32 LBR entries. Add a new LBR init function
to set this up. The filters are all the same as Haswell.

It also has a new LBR format with a separate LBR_INFO_* MSR,
but that has been already added earlier.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285767-27027-7-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           |   6 +-
 arch/x86/kernel/cpu/perf_event_intel.c     | 232 +++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_intel_ds.c  |  22 +++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |  20 +++
 4 files changed, 279 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index e9c5bdf5d94e..8ad9241ddcf5 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -165,7 +165,7 @@ struct intel_excl_cntrs {
 	unsigned	core_id;	/* per-core: core id */
 };
 
-#define MAX_LBR_ENTRIES		16
+#define MAX_LBR_ENTRIES		32
 
 enum {
 	X86_PERF_KFREE_SHARED = 0,
@@ -861,6 +861,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[];
 
 extern struct event_constraint intel_hsw_pebs_event_constraints[];
 
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event);
 
 void intel_pmu_pebs_enable(struct perf_event *event);
@@ -899,6 +901,8 @@ void intel_pmu_lbr_init_snb(void);
 
 void intel_pmu_lbr_init_hsw(void);
 
+void intel_pmu_lbr_init_skl(void);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 void intel_pt_interrupt(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index da93b4bde963..28fc27202d28 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_skl_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),	/* INST_RETIRED.PREC_DIST */
+	EVENT_CONSTRAINT_END
+};
+
 static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
 	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -193,6 +201,13 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
 	EVENT_EXTRA_END
 };
 
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+	EVENT_EXTRA_END
+};
+
 EVENT_ATTR_STR(mem-loads,	mem_ld_nhm,	"event=0x0b,umask=0x10,ldlat=3");
 EVENT_ATTR_STR(mem-loads,	mem_ld_snb,	"event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,	mem_st_snb,	"event=0xcd,umask=0x2");
@@ -244,6 +259,200 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD		BIT_ULL(0)
+#define SKL_DEMAND_RFO			BIT_ULL(1)
+#define SKL_ANY_RESPONSE		BIT_ULL(16)
+#define SKL_SUPPLIER_NONE		BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM		BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM	BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM	BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM	BIT_ULL(29)
+#define SKL_L3_MISS			(SKL_L3_MISS_LOCAL_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT			BIT_ULL(30)
+#define SKL_SNOOP_NONE			BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define SKL_SNOOP_MISS			BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define SKL_SNOOP_HITM			BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM		BIT_ULL(37)
+#define SKL_ANY_SNOOP			(SKL_SPL_HIT|SKL_SNOOP_NONE| \
+					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+					 SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ			SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM			(SKL_SNOOP_NONE| \
+					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+					 SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE		SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS			SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE		(SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_INST_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x151,	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_INST_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x283,	/* ICACHE_64B.MISS */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_INST_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x608,	/* DTLB_LOAD_MISSES.WALK_COMPLETED */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_INST_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x649,	/* DTLB_STORE_MISSES.WALK_COMPLETED */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2085,	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0xe85,	/* ITLB_MISSES.WALK_COMPLETED */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+				       SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+				       SKL_L3_MISS|SKL_ANY_SNOOP|
+				       SKL_SUPPLIER_NONE,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+				       SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+				       SKL_L3_MISS|SKL_ANY_SNOOP|
+				       SKL_SUPPLIER_NONE,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+				       SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+				       SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+				       SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+				       SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 #define SNB_DMND_DATA_RD	(1ULL << 0)
 #define SNB_DMND_RFO		(1ULL << 1)
 #define SNB_DMND_IFETCH		(1ULL << 2)
@@ -3278,6 +3487,29 @@ __init int intel_pmu_init(void)
 		pr_cont("Broadwell events, ");
 		break;
 
+	case 78: /* 14nm Skylake Mobile */
+	case 94: /* 14nm Skylake Desktop */
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		intel_pmu_lbr_init_skl();
+
+		x86_pmu.event_constraints = intel_skl_event_constraints;
+		x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_skl_extra_regs;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = hsw_get_event_constraints;
+		x86_pmu.cpu_events = hsw_events_attrs;
+		WARN_ON(!x86_pmu.format_attrs);
+		x86_pmu.cpu_events = hsw_events_attrs;
+		pr_cont("Skylake events, ");
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 03773c230fb0..2f7ee0541a8f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -688,6 +688,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+	INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),	/* INST_RETIRED.PREC_DIST */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	INTEL_PLD_CONSTRAINT(0x1cd, 0xf),		      /* MEM_TRANS_RETIRED.* */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
+	/* Allow all events as PEBS with no flags */
+	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+	EVENT_CONSTRAINT_END
+};
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 {
 	struct event_constraint *c;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 769a42f874dc..b432c478e068 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -973,6 +973,26 @@ void intel_pmu_lbr_init_hsw(void)
 	pr_cont("16-deep LBR, ");
 }
 
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+	x86_pmu.lbr_nr	 = 32;
+	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
+	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+	/*
+	 * SW branch filter usage:
+	 * - support syscall, sysret capture.
+	 *   That requires LBR_FAR but that means far
+	 *   jmp need to be filtered out
+	 */
+	pr_cont("32-deep LBR, ");
+}
+
 /* atom */
 void __init intel_pmu_lbr_init_atom(void)
 {
-- 
cgit v1.2.3-70-g09d2


From e0573364b8c5b17401569ef581f1625803210f4d Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 27 May 2015 21:13:17 -0700
Subject: perf/x86/intel/lbr: Use correct index to save/restore LBR_INFO with
 call stack

Use the correct index to save/restore the LBR_INFO_x MSR in
callstack mode. This is more a cleanup, as even with the wrong
index the register was correctly saved/restored, and also
LBR callgraph mode in perf tools do not really need anything in
LBR_INFO. But still better to use the right index.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@kernel.org
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1432786398-23861-5-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index b432c478e068..a5bc424569b9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -245,7 +245,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
 		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
 		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			wrmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]);
+			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
 	}
 	task_ctx->lbr_stack_state = LBR_NONE;
 }
@@ -268,7 +268,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
 		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
 		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			rdmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]);
+			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
 	}
 	task_ctx->lbr_stack_state = LBR_VALID;
 }
-- 
cgit v1.2.3-70-g09d2


From 90405aa02247c1a6313c33e2253f9fd2299ae60b Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 27 May 2015 21:13:18 -0700
Subject: perf/x86/intel/lbr: Limit LBR accesses to TOS in callstack mode

In callstack mode the LBR is not a ring buffer, but a stack that grows up
and down. This means in  this case we don't need to access all LBRs, only the
ones up to TOS. Do this optimization for the normal LBR read, and the context
switch save/restore code. For save/restore it can be done unconditionally, as
it only runs when call stack mode is active.

This recovers some of the cost of going to 32 LBRs on Skylake.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@kernel.org
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1432786398-23861-6-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index a5bc424569b9..b2c9475b7ff2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -240,7 +240,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 
 	mask = x86_pmu.lbr_nr - 1;
 	tos = intel_pmu_lbr_tos();
-	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+	for (i = 0; i < tos; i++) {
 		lbr_idx = (tos - i) & mask;
 		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
 		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
@@ -263,7 +263,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 
 	mask = x86_pmu.lbr_nr - 1;
 	tos = intel_pmu_lbr_tos();
-	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+	for (i = 0; i < tos; i++) {
 		lbr_idx = (tos - i) & mask;
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
 		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
@@ -425,8 +425,12 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 	int out = 0;
+	int num = x86_pmu.lbr_nr;
 
-	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+	if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+		num = tos;
+
+	for (i = 0; i < num; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
 		u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
 		int skip = 0;
-- 
cgit v1.2.3-70-g09d2


From 47732d886385af769449022a02c7cf0ce45d8a5c Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 29 Jun 2015 14:22:13 -0700
Subject: perf/x86: Make merge_attr() global to use from perf_event_intel

merge_attr() allows to merge two sysfs attribute tables.
Export it to be usable by other files too.

Next patch is going to use that to extend the sysfs format
attributes for a CPU.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1435612935-24425-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 2 +-
 arch/x86/kernel/cpu/perf_event.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3658de47900f..8bac4bbecb9b 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1551,7 +1551,7 @@ static void __init filter_events(struct attribute **attrs)
 }
 
 /* Merge two pointer arrays */
-static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
+__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
 {
 	struct attribute **new;
 	int j, i;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8ad9241ddcf5..5edf6d868fc1 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -795,6 +795,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
 ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
 ssize_t intel_event_sysfs_show(char *page, u64 config);
 
+struct attribute **merge_attr(struct attribute **a, struct attribute **b);
+
 #ifdef CONFIG_CPU_SUP_AMD
 
 int amd_pmu_init(void);
@@ -926,6 +928,7 @@ static inline int is_ht_workaround_enabled(void)
 {
 	return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
 }
+
 #else /* CONFIG_CPU_SUP_INTEL */
 
 static inline void reserve_ds_buffers(void)
-- 
cgit v1.2.3-70-g09d2


From 8c4fe7095d633dd5543690ea5c3d522c5cd989b6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 30 Jun 2015 16:33:24 -0700
Subject: perf/x86/intel: Use 0x11 as extra reg test value

The next patch adds a new perf extra register where 0x1ff is not a valid
value. Use 0x11 instead.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1435707205-6676-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 28fc27202d28..a478e3c4cc3f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -3579,7 +3579,7 @@ __init int intel_pmu_init(void)
 	 */
 	if (x86_pmu.extra_regs) {
 		for (er = x86_pmu.extra_regs; er->msr; er++) {
-			er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+			er->extra_msr_access = check_msr(er->msr, 0x11UL);
 			/* Disable LBR select mapping */
 			if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
 				x86_pmu.lbr_sel_map = NULL;
-- 
cgit v1.2.3-70-g09d2


From 070e98873cf7196cad58f8b6e5278dd5533c81f0 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 2 Jul 2015 08:12:52 -0400
Subject: perf/x86/intel/uncore: Add Broadwell-DE uncore support

The uncore subsystem for Broadwell-DE is similar to Haswell-EP.  There
are some differences in pci device IDs, box number and constraints.

Please refer to the public document:

  http://www.intel.com/content/www/us/en/processors/xeon/xeon-d-1500-uncore-performance-monitoring.html

Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1435839172-15114-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c      |   6 +
 arch/x86/kernel/cpu/perf_event_intel_uncore.h      |   2 +
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 164 +++++++++++++++++++++
 3 files changed, 172 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index c2af96716d62..560e5255b15e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -911,6 +911,9 @@ static int __init uncore_pci_init(void)
 	case 63: /* Haswell-EP */
 		ret = hswep_uncore_pci_init();
 		break;
+	case 86: /* BDX-DE */
+		ret = bdx_uncore_pci_init();
+		break;
 	case 42: /* Sandy Bridge */
 		ret = snb_uncore_pci_init();
 		break;
@@ -1229,6 +1232,9 @@ static int __init uncore_cpu_init(void)
 	case 63: /* Haswell-EP */
 		hswep_uncore_cpu_init();
 		break;
+	case 86: /* BDX-DE */
+		bdx_uncore_cpu_init();
+		break;
 	default:
 		return 0;
 	}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a196e4..72c54c2e5b1a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -336,6 +336,8 @@ int ivbep_uncore_pci_init(void);
 void ivbep_uncore_cpu_init(void);
 int hswep_uncore_pci_init(void);
 void hswep_uncore_cpu_init(void);
+int bdx_uncore_pci_init(void);
+void bdx_uncore_cpu_init(void);
 
 /* perf_event_intel_uncore_nhmex.c */
 void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 76a3feb15ab1..694510a887dc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -2321,3 +2321,167 @@ int hswep_uncore_pci_init(void)
 	return 0;
 }
 /* end of Haswell-EP uncore support */
+
+/* BDX-DE uncore support */
+
+static struct intel_uncore_type bdx_uncore_ubox = {
+	.name			= "ubox",
+	.num_counters		= 2,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.fixed_ctr_bits		= 48,
+	.perf_ctr		= HSWEP_U_MSR_PMON_CTR0,
+	.event_ctl		= HSWEP_U_MSR_PMON_CTL0,
+	.event_mask		= SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+	.fixed_ctr		= HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+	.fixed_ctl		= HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &ivbep_uncore_msr_ops,
+	.format_group		= &ivbep_uncore_ubox_format_group,
+};
+
+static struct event_constraint bdx_uncore_cbox_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_cbox = {
+	.name			= "cbox",
+	.num_counters		= 4,
+	.num_boxes		= 8,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= HSWEP_C0_MSR_PMON_CTL0,
+	.perf_ctr		= HSWEP_C0_MSR_PMON_CTR0,
+	.event_mask		= SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= HSWEP_C0_MSR_PMON_BOX_CTL,
+	.msr_offset		= HSWEP_CBO_MSR_OFFSET,
+	.num_shared_regs	= 1,
+	.constraints		= bdx_uncore_cbox_constraints,
+	.ops			= &hswep_uncore_cbox_ops,
+	.format_group		= &hswep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type *bdx_msr_uncores[] = {
+	&bdx_uncore_ubox,
+	&bdx_uncore_cbox,
+	&hswep_uncore_pcu,
+	NULL,
+};
+
+void bdx_uncore_cpu_init(void)
+{
+	if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+		bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+	uncore_msr_uncores = bdx_msr_uncores;
+}
+
+static struct intel_uncore_type bdx_uncore_ha = {
+	.name		= "ha",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_imc = {
+	.name		= "imc",
+	.num_counters   = 5,
+	.num_boxes	= 2,
+	.perf_ctr_bits	= 48,
+	.fixed_ctr_bits	= 48,
+	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+	.event_descs	= hswep_uncore_imc_events,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_irp = {
+	.name			= "irp",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.event_mask		= SNBEP_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
+	.ops			= &hswep_uncore_irp_ops,
+	.format_group		= &snbep_uncore_format_group,
+};
+
+
+static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r2pcie = {
+	.name		= "r2pcie",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	.constraints	= bdx_uncore_r2pcie_constraints,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+	BDX_PCI_UNCORE_HA,
+	BDX_PCI_UNCORE_IMC,
+	BDX_PCI_UNCORE_IRP,
+	BDX_PCI_UNCORE_R2PCIE,
+};
+
+static struct intel_uncore_type *bdx_pci_uncores[] = {
+	[BDX_PCI_UNCORE_HA]	= &bdx_uncore_ha,
+	[BDX_PCI_UNCORE_IMC]	= &bdx_uncore_imc,
+	[BDX_PCI_UNCORE_IRP]	= &bdx_uncore_irp,
+	[BDX_PCI_UNCORE_R2PCIE]	= &bdx_uncore_r2pcie,
+	NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = {
+	{ /* Home Agent 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
+	},
+	{ /* MC0 Channel 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
+	},
+	{ /* MC0 Channel 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
+	},
+	{ /* IRP */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
+	},
+	{ /* R2PCIe */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
+	},
+	{ /* end: all zeroes */ }
+};
+
+static struct pci_driver bdx_uncore_pci_driver = {
+	.name		= "bdx_uncore",
+	.id_table	= bdx_uncore_pci_ids,
+};
+
+int bdx_uncore_pci_init(void)
+{
+	int ret = snbep_pci2phy_map_init(0x6f1e);
+
+	if (ret)
+		return ret;
+	uncore_pci_uncores = bdx_pci_uncores;
+	uncore_pci_driver = &bdx_uncore_pci_driver;
+	return 0;
+}
+
+/* end of BDX-DE uncore support */
-- 
cgit v1.2.3-70-g09d2


From b7b7c7821d932ba18ef6c8eafc8536066b4c2ef4 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 20 Jul 2015 11:49:06 -0400
Subject: perf/x86: Add an MSR PMU driver

This patch adds an MSR PMU to support free running MSR counters. Such
as time and freq related counters includes TSC, IA32_APERF, IA32_MPERF
and IA32_PPERF, but also SMI_COUNT.

The events are exposed in sysfs for use by perf stat and other tools.
The files are under /sys/devices/msr/events/

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Kan Liang <kan.liang@intel.com>
[ s/freq/msr/, added SMI_COUNT, fixed bugs. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@kernel.org
Cc: adrian.hunter@intel.com
Cc: dsahern@gmail.com
Cc: eranian@google.com
Cc: jolsa@kernel.org
Cc: mark.rutland@arm.com
Cc: namhyung@kernel.org
Link: http://lkml.kernel.org/r/1437407346-31186-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/Makefile         |   2 +
 arch/x86/kernel/cpu/perf_event_msr.c | 242 +++++++++++++++++++++++++++++++++++
 2 files changed, 244 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/perf_event_msr.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 9bff68798836..4eb065c6bed2 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
 					   perf_event_intel_uncore_snbep.o \
 					   perf_event_intel_uncore_nhmex.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_msr.o
+obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_msr.o
 endif
 
 
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
new file mode 100644
index 000000000000..af216e9223e8
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -0,0 +1,242 @@
+#include <linux/perf_event.h>
+
+enum perf_msr_id {
+	PERF_MSR_TSC			= 0,
+	PERF_MSR_APERF			= 1,
+	PERF_MSR_MPERF			= 2,
+	PERF_MSR_PPERF			= 3,
+	PERF_MSR_SMI			= 4,
+
+	PERF_MSR_EVENT_MAX,
+};
+
+struct perf_msr {
+	int	id;
+	u64	msr;
+};
+
+static struct perf_msr msr[] = {
+	{ PERF_MSR_TSC, 0 },
+	{ PERF_MSR_APERF, MSR_IA32_APERF },
+	{ PERF_MSR_MPERF, MSR_IA32_MPERF },
+	{ PERF_MSR_PPERF, MSR_PPERF },
+	{ PERF_MSR_SMI, MSR_SMI_COUNT },
+};
+
+PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04");
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+	&evattr_tsc.attr.attr,
+};
+
+static struct attribute_group events_attr_group = {
+	.name = "events",
+	.attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+static struct attribute_group format_attr_group = {
+	.name = "format",
+	.attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+	&events_attr_group,
+	&format_attr_group,
+	NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+	u64 cfg = event->attr.config;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (cfg >= PERF_MSR_EVENT_MAX)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	event->hw.idx = -1;
+	event->hw.event_base = msr[cfg].msr;
+	event->hw.config = cfg;
+
+	return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+	u64 now;
+
+	if (event->hw.event_base)
+		rdmsrl(event->hw.event_base, now);
+	else
+		now = rdtsc();
+
+	return now;
+}
+static void msr_event_update(struct perf_event *event)
+{
+	u64 prev, now;
+	s64 delta;
+
+	/* Careful, an NMI might modify the previous event value. */
+again:
+	prev = local64_read(&event->hw.prev_count);
+	now = msr_read_counter(event);
+
+	if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+		goto again;
+
+	delta = now - prev;
+	if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
+		delta <<= 32;
+		delta >>= 32; /* sign extend */
+	}
+	local64_add(now - prev, &event->count);
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+	u64 now;
+
+	now = msr_read_counter(event);
+	local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+	msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+	msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		msr_event_start(event, flags);
+
+	return 0;
+}
+
+static struct pmu pmu_msr = {
+	.task_ctx_nr	= perf_sw_context,
+	.attr_groups	= attr_groups,
+	.event_init	= msr_event_init,
+	.add		= msr_event_add,
+	.del		= msr_event_del,
+	.start		= msr_event_start,
+	.stop		= msr_event_stop,
+	.read		= msr_event_update,
+	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init intel_msr_init(int idx)
+{
+	if (boot_cpu_data.x86 != 6)
+		return 0;
+
+	switch (boot_cpu_data.x86_model) {
+	case 30: /* 45nm Nehalem    */
+	case 26: /* 45nm Nehalem-EP */
+	case 46: /* 45nm Nehalem-EX */
+
+	case 37: /* 32nm Westmere    */
+	case 44: /* 32nm Westmere-EP */
+	case 47: /* 32nm Westmere-EX */
+
+	case 42: /* 32nm SandyBridge         */
+	case 45: /* 32nm SandyBridge-E/EN/EP */
+
+	case 58: /* 22nm IvyBridge       */
+	case 62: /* 22nm IvyBridge-EP/EX */
+
+	case 60: /* 22nm Haswell Core */
+	case 63: /* 22nm Haswell Server */
+	case 69: /* 22nm Haswell ULT */
+	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+	case 61: /* 14nm Broadwell Core-M */
+	case 86: /* 14nm Broadwell Xeon D */
+	case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+	case 79: /* 14nm Broadwell Server */
+		events_attrs[idx++] = &evattr_smi.attr.attr;
+		break;
+
+	case 78: /* 14nm Skylake Mobile */
+	case 94: /* 14nm Skylake Desktop */
+		events_attrs[idx++] = &evattr_pperf.attr.attr;
+		events_attrs[idx++] = &evattr_smi.attr.attr;
+		break;
+
+	case 55: /* 22nm Atom "Silvermont"                */
+	case 76: /* 14nm Atom "Airmont"                   */
+	case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+		events_attrs[idx++] = &evattr_smi.attr.attr;
+		break;
+	}
+
+	events_attrs[idx] = NULL;
+
+	return 0;
+}
+
+static int __init amd_msr_init(int idx)
+{
+	return 0;
+}
+
+static int __init msr_init(void)
+{
+	int err;
+	int idx = 1;
+
+	if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
+		events_attrs[idx++] = &evattr_aperf.attr.attr;
+		events_attrs[idx++] = &evattr_mperf.attr.attr;
+		events_attrs[idx] = NULL;
+	}
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		err = intel_msr_init(idx);
+		break;
+
+	case X86_VENDOR_AMD:
+		err = amd_msr_init(idx);
+		break;
+
+	default:
+		err = -ENOTSUPP;
+	}
+
+	if (err != 0) {
+		pr_cont("no msr PMU driver.\n");
+		return 0;
+	}
+
+	perf_pmu_register(&pmu_msr, "msr", -1);
+
+	return 0;
+}
+device_initcall(msr_init);
-- 
cgit v1.2.3-70-g09d2


From 2a853e1123cbbb43ff74cc47b4ec582ce34df262 Mon Sep 17 00:00:00 2001
From: "Liang, Kan" <kan.liang@intel.com>
Date: Fri, 3 Jul 2015 20:08:27 +0000
Subject: perf/x86/intel/pebs: Fix event disable PEBS buffer drain

When disabling a PEBS event, we need to drain the buffer. Doing so
requires a correct cpuc->pebs_active mask.

The current code clears the pebs_active bit before draining the
buffer. Fix that.

Signed-off-by: "Liang, Kan" <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver<vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/37D7C6CF3E00A74B8858931C1DB2F07701885A65@SHSMSX103.ccr.corp.intel.com
[ Fixed the SOB. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 2f7ee0541a8f..8e564f817a7f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -789,6 +789,11 @@ void intel_pmu_pebs_disable(struct perf_event *event)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	struct debug_store *ds = cpuc->ds;
+	bool large_pebs = ds->pebs_interrupt_threshold >
+		ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+	if (large_pebs)
+		intel_pmu_drain_pebs_buffer();
 
 	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
 
@@ -797,12 +802,8 @@ void intel_pmu_pebs_disable(struct perf_event *event)
 	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled &= ~(1ULL << 63);
 
-	if (ds->pebs_interrupt_threshold >
-	    ds->pebs_buffer_base + x86_pmu.pebs_record_size) {
-		intel_pmu_drain_pebs_buffer();
-		if (!pebs_is_enabled(cpuc))
-			perf_sched_cb_dec(event->ctx->pmu);
-	}
+	if (large_pebs && !pebs_is_enabled(cpuc))
+		perf_sched_cb_dec(event->ctx->pmu);
 
 	if (cpuc->enabled)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-- 
cgit v1.2.3-70-g09d2


From 75f80859b130a1cc84e59e71295ce2dd51fe1c81 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 15 Jul 2015 14:35:46 +0200
Subject: perf/x86/intel/pebs: Robustify PEBS buffer drain

Vince Weaver and Stephane Eranian reported warnings in the PEBS
code when running the perf fuzzer. Stephane wrote:

  > I can reproduce the problem on my HSW running the fuzzer.
  >
  > I can see why this could be happening if you are mixing PEBS and non PEBS events
  > in the bottom 4 counters. I suspect:
  >         for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
  >                 if ((counts[bit] == 0) && (error[bit] == 0))
  >                         continue;
  >
  > This test is not correct when you have non-PEBS events mixed with
  > PEBS events and they overflow at the same time. They will have
  > counts[i] != 0 but error[i] == 0, and thus you fall thru the loop
  > and hit the assert. Or it is something along those lines.

The only way I can make this work is if ->status only has !PEBS events
set, because if it has both set we'll take that slow path which masks
out the !PEBS bits.

After masking there are 3 options:

 - there is one bit set, and its @bit, we increment counts[bit].

 - there are multiple bits set, we increment error[] for each set bit,
   we do not increment counts[].

 - there are no bits set, we do nothing.

The intent was to never increment counts[] for !PEBS events.

Now if we start out with only a single !PEBS event set, we'll pass the
test and increment counts[] for a !PEBS and hit the warn.

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Reported-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 34 +++++++++++++++----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 8e564f817a7f..84f236ab96b0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -1188,6 +1188,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 
 	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
 		struct pebs_record_nhm *p = at;
+		u64 pebs_status;
 
 		/* PEBS v3 has accurate status bits */
 		if (x86_pmu.intel_cap.pebs_format >= 3) {
@@ -1198,12 +1199,17 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 			continue;
 		}
 
-		bit = find_first_bit((unsigned long *)&p->status,
+		pebs_status = p->status & cpuc->pebs_enabled;
+		pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+
+		bit = find_first_bit((unsigned long *)&pebs_status,
 					x86_pmu.max_pebs_events);
-		if (bit >= x86_pmu.max_pebs_events)
-			continue;
-		if (!test_bit(bit, cpuc->active_mask))
+		if (WARN(bit >= x86_pmu.max_pebs_events,
+			 "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx",
+			 (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled,
+			 *(unsigned long long *)cpuc->active_mask))
 			continue;
+
 		/*
 		 * The PEBS hardware does not deal well with the situation
 		 * when events happen near to each other and multiple bits
@@ -1218,27 +1224,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		 * one, and it's not possible to reconstruct all events
 		 * that caused the PEBS record. It's called collision.
 		 * If collision happened, the record will be dropped.
-		 *
 		 */
-		if (p->status != (1 << bit)) {
-			u64 pebs_status;
-
-			/* slow path */
-			pebs_status = p->status & cpuc->pebs_enabled;
-			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
-			if (pebs_status != (1 << bit)) {
-				for_each_set_bit(i, (unsigned long *)&pebs_status,
-						 MAX_PEBS_EVENTS)
-					error[i]++;
-				continue;
-			}
+		if (p->status != (1ULL << bit)) {
+			for_each_set_bit(i, (unsigned long *)&pebs_status,
+					 x86_pmu.max_pebs_events)
+				error[i]++;
+			continue;
 		}
+
 		counts[bit]++;
 	}
 
 	for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
 		if ((counts[bit] == 0) && (error[bit] == 0))
 			continue;
+
 		event = cpuc->events[bit];
 		WARN_ON_ONCE(!event);
 		WARN_ON_ONCE(!event->attr.precise_ip);
-- 
cgit v1.2.3-70-g09d2


From 2517281d63a2b09d94aedfb522943617048f337e Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Sat, 1 Aug 2015 16:08:07 -0700
Subject: Drivers: hv: vmbus: add special kexec handler

When general-purpose kexec (not kdump) is being performed in Hyper-V guest
the newly booted kernel fails with an MCE error coming from the host. It
is the same error which was fixed in the "Drivers: hv: vmbus: Implement
the protocol for tearing down vmbus state" commit - monitor pages remain
special and when they're being written to (as the new kernel doesn't know
these pages are special) bad things happen. We need to perform some
minimalistic cleanup before booting a new kernel on kexec. To do so we
need to register a special machine_ops.shutdown handler to be executed
before the native_machine_shutdown(). Registering a shutdown notification
handler via the register_reboot_notifier() call is not sufficient as it
happens to early for our purposes. machine_ops is not being exported to
modules (and I don't think we want to export it) so let's do this in
mshyperv.c

The minimalistic cleanup consists of cleaning up clockevents, synic MSRs,
guest os id MSR, and hypercall MSR.

Kdump doesn't require all this stuff as it lives in a separate memory
space.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/mshyperv.h |  2 ++
 arch/x86/kernel/cpu/mshyperv.c  | 24 ++++++++++++++++++++++++
 drivers/hv/vmbus_drv.c          | 14 ++++++++++++++
 3 files changed, 40 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index c163215abb9a..d3db9108346b 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -20,4 +20,6 @@ void hyperv_vector_handler(struct pt_regs *regs);
 void hv_setup_vmbus_irq(void (*handler)(void));
 void hv_remove_vmbus_irq(void);
 
+void hv_setup_kexec_handler(void (*handler)(void));
+void hv_remove_kexec_handler(void);
 #endif
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index aad4bd84b475..fa483ed4756e 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
 #include <linux/efi.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <linux/kexec.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 #include <asm/hyperv.h>
@@ -28,10 +29,13 @@
 #include <asm/i8259.h>
 #include <asm/apic.h>
 #include <asm/timer.h>
+#include <asm/reboot.h>
 
 struct ms_hyperv_info ms_hyperv;
 EXPORT_SYMBOL_GPL(ms_hyperv);
 
+static void (*hv_kexec_handler)(void);
+
 #if IS_ENABLED(CONFIG_HYPERV)
 static void (*vmbus_handler)(void);
 
@@ -67,8 +71,27 @@ void hv_remove_vmbus_irq(void)
 }
 EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
 EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
+
+void hv_setup_kexec_handler(void (*handler)(void))
+{
+	hv_kexec_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
+
+void hv_remove_kexec_handler(void)
+{
+	hv_kexec_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
 #endif
 
+static void hv_machine_shutdown(void)
+{
+	if (kexec_in_progress && hv_kexec_handler)
+		hv_kexec_handler();
+	native_machine_shutdown();
+}
+
 static uint32_t  __init ms_hyperv_platform(void)
 {
 	u32 eax;
@@ -141,6 +164,7 @@ static void __init ms_hyperv_init_platform(void)
 	no_timer_check = 1;
 #endif
 
+	machine_ops.shutdown = hv_machine_shutdown;
 }
 
 const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 00d5158b4ac7..31748a21d8f9 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1060,6 +1060,17 @@ static struct acpi_driver vmbus_acpi_driver = {
 	},
 };
 
+static void hv_kexec_handler(void)
+{
+	int cpu;
+
+	hv_synic_clockevents_cleanup();
+	vmbus_initiate_unload();
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, hv_synic_cleanup, NULL, 1);
+	hv_cleanup();
+};
+
 static int __init hv_acpi_init(void)
 {
 	int ret, t;
@@ -1092,6 +1103,8 @@ static int __init hv_acpi_init(void)
 	if (ret)
 		goto cleanup;
 
+	hv_setup_kexec_handler(hv_kexec_handler);
+
 	return 0;
 
 cleanup:
@@ -1104,6 +1117,7 @@ static void __exit vmbus_exit(void)
 {
 	int cpu;
 
+	hv_remove_kexec_handler();
 	vmbus_connection.conn_state = DISCONNECTED;
 	hv_synic_clockevents_cleanup();
 	vmbus_disconnect();
-- 
cgit v1.2.3-70-g09d2


From b4370df2b1f5158de028e167974263c5757b34a6 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Sat, 1 Aug 2015 16:08:09 -0700
Subject: Drivers: hv: vmbus: add special crash handler

Full kernel hang is observed when kdump kernel starts after a crash. This
hang happens in vmbus_negotiate_version() function on
wait_for_completion() as Hyper-V host (Win2012R2 in my testing) never
responds to CHANNELMSG_INITIATE_CONTACT as it thinks the connection is
already established. We need to perform some mandatory minimalistic
cleanup before we start new kernel.

Reported-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/mshyperv.h |  2 ++
 arch/x86/kernel/cpu/mshyperv.c  | 22 ++++++++++++++++++++++
 drivers/hv/vmbus_drv.c          | 14 ++++++++++++++
 3 files changed, 38 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index d3db9108346b..d02f9c9a6592 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -22,4 +22,6 @@ void hv_remove_vmbus_irq(void);
 
 void hv_setup_kexec_handler(void (*handler)(void));
 void hv_remove_kexec_handler(void);
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs));
+void hv_remove_crash_handler(void);
 #endif
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index fa483ed4756e..5ea5bbcc85b3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -35,6 +35,7 @@ struct ms_hyperv_info ms_hyperv;
 EXPORT_SYMBOL_GPL(ms_hyperv);
 
 static void (*hv_kexec_handler)(void);
+static void (*hv_crash_handler)(struct pt_regs *regs);
 
 #if IS_ENABLED(CONFIG_HYPERV)
 static void (*vmbus_handler)(void);
@@ -83,6 +84,18 @@ void hv_remove_kexec_handler(void)
 	hv_kexec_handler = NULL;
 }
 EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
+
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+	hv_crash_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
+
+void hv_remove_crash_handler(void)
+{
+	hv_crash_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
 #endif
 
 static void hv_machine_shutdown(void)
@@ -92,6 +105,14 @@ static void hv_machine_shutdown(void)
 	native_machine_shutdown();
 }
 
+static void hv_machine_crash_shutdown(struct pt_regs *regs)
+{
+	if (hv_crash_handler)
+		hv_crash_handler(regs);
+	native_machine_crash_shutdown(regs);
+}
+
+
 static uint32_t  __init ms_hyperv_platform(void)
 {
 	u32 eax;
@@ -165,6 +186,7 @@ static void __init ms_hyperv_init_platform(void)
 #endif
 
 	machine_ops.shutdown = hv_machine_shutdown;
+	machine_ops.crash_shutdown = hv_machine_crash_shutdown;
 }
 
 const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 31748a21d8f9..1ed9b32853a1 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1071,6 +1071,18 @@ static void hv_kexec_handler(void)
 	hv_cleanup();
 };
 
+static void hv_crash_handler(struct pt_regs *regs)
+{
+	vmbus_initiate_unload();
+	/*
+	 * In crash handler we can't schedule synic cleanup for all CPUs,
+	 * doing the cleanup for current CPU only. This should be sufficient
+	 * for kdump.
+	 */
+	hv_synic_cleanup(NULL);
+	hv_cleanup();
+};
+
 static int __init hv_acpi_init(void)
 {
 	int ret, t;
@@ -1104,6 +1116,7 @@ static int __init hv_acpi_init(void)
 		goto cleanup;
 
 	hv_setup_kexec_handler(hv_kexec_handler);
+	hv_setup_crash_handler(hv_crash_handler);
 
 	return 0;
 
@@ -1118,6 +1131,7 @@ static void __exit vmbus_exit(void)
 	int cpu;
 
 	hv_remove_kexec_handler();
+	hv_remove_crash_handler();
 	vmbus_connection.conn_state = DISCONNECTED;
 	hv_synic_clockevents_cleanup();
 	vmbus_disconnect();
-- 
cgit v1.2.3-70-g09d2


From cc2dd4027a43bb36c846f195a764edabc0828602 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Sat, 1 Aug 2015 16:08:20 -0700
Subject: mshyperv: fix recognition of Hyper-V guest crash MSR's

Hypervisor Top Level Functional Specification v3.1/4.0 notes that cpuid
(0x40000003) EDX's 10th bit should be used to check that Hyper-V guest
crash MSR's functionality available.

This patch should fix this recognition. Currently the code checks EAX
register instead of EDX.

Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/mshyperv.h | 1 +
 arch/x86/kernel/cpu/mshyperv.c  | 1 +
 drivers/hv/vmbus_drv.c          | 4 ++--
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index d02f9c9a6592..aaf59b7da98a 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -7,6 +7,7 @@
 
 struct ms_hyperv_info {
 	u32 features;
+	u32 misc_features;
 	u32 hints;
 };
 
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 5ea5bbcc85b3..f794bfa3c138 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -158,6 +158,7 @@ static void __init ms_hyperv_init_platform(void)
 	 * Extract the features and hints
 	 */
 	ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+	ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
 	ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
 	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index b6114cc89aeb..e7b0bcd453e7 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -871,7 +871,7 @@ static int vmbus_bus_init(int irq)
 	/*
 	 * Only register if the crash MSRs are available
 	 */
-	if (ms_hyperv.features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
+	if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
 		register_die_notifier(&hyperv_die_block);
 		atomic_notifier_chain_register(&panic_notifier_list,
 					       &hyperv_panic_block);
@@ -1169,7 +1169,7 @@ static void __exit vmbus_exit(void)
 	hv_remove_vmbus_irq();
 	tasklet_kill(&msg_dpc);
 	vmbus_free_channels();
-	if (ms_hyperv.features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
+	if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
 		unregister_die_notifier(&hyperv_die_block);
 		atomic_notifier_chain_unregister(&panic_notifier_list,
 						 &hyperv_panic_block);
-- 
cgit v1.2.3-70-g09d2


From df54c4934e030e73cb6a7bd6713f697350dabd0b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 2 Aug 2015 20:38:23 +0000
Subject: x86/irq: Protect smp_cleanup_move

smp_cleanup_move fiddles without protection in the interrupt
descriptors and the vector array. A concurrent irq setup/teardown or
affinity setting can pull the rug under that operation.

Add proper locking.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/20150802203609.222975294@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/vector.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f47069e8efac..63d58b08a109 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -539,6 +539,9 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 
 	entering_ack_irq();
 
+	/* Prevent vectors vanishing under us */
+	raw_spin_lock(&vector_lock);
+
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		int irq;
@@ -546,6 +549,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 		struct irq_desc *desc;
 		struct apic_chip_data *data;
 
+	retry:
 		irq = __this_cpu_read(vector_irq[vector]);
 
 		if (irq <= VECTOR_UNDEFINED)
@@ -555,12 +559,16 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 		if (!desc)
 			continue;
 
+		if (!raw_spin_trylock(&desc->lock)) {
+			raw_spin_unlock(&vector_lock);
+			cpu_relax();
+			raw_spin_lock(&vector_lock);
+			goto retry;
+		}
+
 		data = apic_chip_data(&desc->irq_data);
 		if (!data)
-			continue;
-
-		raw_spin_lock(&desc->lock);
-
+			goto unlock;
 		/*
 		 * Check if the irq migration is in progress. If so, we
 		 * haven't received the cleanup request yet for this irq.
@@ -589,6 +597,8 @@ unlock:
 		raw_spin_unlock(&desc->lock);
 	}
 
+	raw_spin_unlock(&vector_lock);
+
 	exiting_irq();
 }
 
-- 
cgit v1.2.3-70-g09d2


From 24c70e07a0311a98dbb5e7a7472fa96a22b789d3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 2 Aug 2015 20:38:24 +0000
Subject: x86/irq: Replace numeric constant

Use the proper define instead of 0.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/20150802203609.385495420@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/irq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index bc28496fd196..35d4cb287771 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -401,8 +401,8 @@ int check_irq_vectors_for_cpu_disable(void)
 		for (vector = FIRST_EXTERNAL_VECTOR;
 		     vector < first_system_vector; vector++) {
 			if (!test_bit(vector, used_vectors) &&
-			    per_cpu(vector_irq, cpu)[vector] < 0)
-					count++;
+			    per_cpu(vector_irq, cpu)[vector] <= VECTOR_UNDEFINED)
+				count++;
 		}
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 7276c6a2cb5f7ae46fd0c9539af02dbcb7c4f3f5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 2 Aug 2015 20:38:25 +0000
Subject: x86/irq: Rename VECTOR_UNDEFINED to VECTOR_UNUSED

VECTOR_UNDEFINED is a misnomer. The vector is defined, but unused.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/20150802203609.477282494@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/hw_irq.h |  2 +-
 arch/x86/kernel/apic/vector.c | 15 +++++++--------
 arch/x86/kernel/irq.c         |  8 ++++----
 arch/x86/kernel/irqinit.c     |  4 ++--
 4 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6615032e19c8..62bb8d23b826 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -182,7 +182,7 @@ extern char irq_entries_start[];
 #define trace_irq_entries_start irq_entries_start
 #endif
 
-#define VECTOR_UNDEFINED	(-1)
+#define VECTOR_UNUSED		(-1)
 #define VECTOR_RETRIGGERED	(-2)
 
 typedef int vector_irq_t[NR_VECTORS];
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 63d58b08a109..9a6d11258684 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -169,8 +169,7 @@ next:
 			goto next;
 
 		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
-			if (per_cpu(vector_irq, new_cpu)[vector] >
-			    VECTOR_UNDEFINED)
+			if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNUSED)
 				goto next;
 		}
 		/* Found one! */
@@ -232,7 +231,7 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
 
 	vector = data->cfg.vector;
 	for_each_cpu_and(cpu, data->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 
 	data->cfg.vector = 0;
 	cpumask_clear(data->domain);
@@ -247,7 +246,7 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
 		     vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
 				continue;
-			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 			break;
 		}
 	}
@@ -423,12 +422,12 @@ static void __setup_vector_irq(int cpu)
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
-		if (irq <= VECTOR_UNDEFINED)
+		if (irq <= VECTOR_UNUSED)
 			continue;
 
 		data = apic_chip_data(irq_get_irq_data(irq));
 		if (!cpumask_test_cpu(cpu, data->domain))
-			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 	}
 }
 
@@ -552,7 +551,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 	retry:
 		irq = __this_cpu_read(vector_irq[vector]);
 
-		if (irq <= VECTOR_UNDEFINED)
+		if (irq <= VECTOR_UNUSED)
 			continue;
 
 		desc = irq_to_desc(irq);
@@ -592,7 +591,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
 			goto unlock;
 		}
-		__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+		__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 unlock:
 		raw_spin_unlock(&desc->lock);
 	}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 35d4cb287771..931bdd2f9759 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -228,7 +228,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 					     __func__, smp_processor_id(),
 					     vector, irq);
 		} else {
-			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 		}
 	}
 
@@ -401,7 +401,7 @@ int check_irq_vectors_for_cpu_disable(void)
 		for (vector = FIRST_EXTERNAL_VECTOR;
 		     vector < first_system_vector; vector++) {
 			if (!test_bit(vector, used_vectors) &&
-			    per_cpu(vector_irq, cpu)[vector] <= VECTOR_UNDEFINED)
+			    per_cpu(vector_irq, cpu)[vector] <= VECTOR_UNUSED)
 				count++;
 		}
 	}
@@ -506,7 +506,7 @@ void fixup_irqs(void)
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		unsigned int irr;
 
-		if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
+		if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNUSED)
 			continue;
 
 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -524,7 +524,7 @@ void fixup_irqs(void)
 			raw_spin_unlock(&desc->lock);
 		}
 		if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
-			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 	}
 }
 #endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a3a5e158ed69..32a637ffdf98 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
 };
 
 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
+	[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };
 
 int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
+		if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNUSED)
 			return 1;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 44825757a3ee37c030165a94d6b4dd79c564f661 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 2 Aug 2015 20:38:25 +0000
Subject: x86/irq: Get rid of an indentation level

Make the code simpler to read.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/20150802203609.555253675@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/irq.c | 72 +++++++++++++++++++++++++--------------------------
 1 file changed, 35 insertions(+), 37 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 931bdd2f9759..140950fb9902 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -342,47 +342,45 @@ int check_irq_vectors_for_cpu_disable(void)
 	this_count = 0;
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		irq = __this_cpu_read(vector_irq[vector]);
-		if (irq >= 0) {
-			desc = irq_to_desc(irq);
-			if (!desc)
-				continue;
+		if (irq < 0)
+			continue;
+		desc = irq_to_desc(irq);
+		if (!desc)
+			continue;
 
-			/*
-			 * Protect against concurrent action removal,
-			 * affinity changes etc.
-			 */
-			raw_spin_lock(&desc->lock);
-			data = irq_desc_get_irq_data(desc);
-			cpumask_copy(&affinity_new,
-				     irq_data_get_affinity_mask(data));
-			cpumask_clear_cpu(this_cpu, &affinity_new);
-
-			/* Do not count inactive or per-cpu irqs. */
-			if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
-				raw_spin_unlock(&desc->lock);
-				continue;
-			}
+		/*
+		 * Protect against concurrent action removal, affinity
+		 * changes etc.
+		 */
+		raw_spin_lock(&desc->lock);
+		data = irq_desc_get_irq_data(desc);
+		cpumask_copy(&affinity_new, irq_data_get_affinity_mask(data));
+		cpumask_clear_cpu(this_cpu, &affinity_new);
 
+		/* Do not count inactive or per-cpu irqs. */
+		if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
 			raw_spin_unlock(&desc->lock);
-			/*
-			 * A single irq may be mapped to multiple
-			 * cpu's vector_irq[] (for example IOAPIC cluster
-			 * mode).  In this case we have two
-			 * possibilities:
-			 *
-			 * 1) the resulting affinity mask is empty; that is
-			 * this the down'd cpu is the last cpu in the irq's
-			 * affinity mask, or
-			 *
-			 * 2) the resulting affinity mask is no longer
-			 * a subset of the online cpus but the affinity
-			 * mask is not zero; that is the down'd cpu is the
-			 * last online cpu in a user set affinity mask.
-			 */
-			if (cpumask_empty(&affinity_new) ||
-			    !cpumask_subset(&affinity_new, &online_new))
-				this_count++;
+			continue;
 		}
+
+		raw_spin_unlock(&desc->lock);
+		/*
+		 * A single irq may be mapped to multiple cpu's
+		 * vector_irq[] (for example IOAPIC cluster mode).  In
+		 * this case we have two possibilities:
+		 *
+		 * 1) the resulting affinity mask is empty; that is
+		 * this the down'd cpu is the last cpu in the irq's
+		 * affinity mask, or
+		 *
+		 * 2) the resulting affinity mask is no longer a
+		 * subset of the online cpus but the affinity mask is
+		 * not zero; that is the down'd cpu is the last online
+		 * cpu in a user set affinity mask.
+		 */
+		if (cpumask_empty(&affinity_new) ||
+		    !cpumask_subset(&affinity_new, &online_new))
+			this_count++;
 	}
 
 	count = 0;
-- 
cgit v1.2.3-70-g09d2


From a782a7e46bb50822fabfeb7271605762a59c86df Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 2 Aug 2015 20:38:27 +0000
Subject: x86/irq: Store irq descriptor in vector array

We can spare the irq_desc lookup in the interrupt entry code if we
store the descriptor pointer in the vector array instead the interrupt
number.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/20150802203609.717724106@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/hw_irq.h |  6 ++---
 arch/x86/include/asm/irq.h    |  4 +++-
 arch/x86/kernel/apic/vector.c | 51 ++++++++++++++++++++-----------------------
 arch/x86/kernel/irq.c         | 37 ++++++++++++++-----------------
 arch/x86/kernel/irq_32.c      |  9 ++++----
 arch/x86/kernel/irq_64.c      |  9 +++-----
 arch/x86/kernel/irqinit.c     |  4 ++--
 arch/x86/lguest/boot.c        |  4 +++-
 8 files changed, 58 insertions(+), 66 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 62bb8d23b826..1e3408e88604 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -182,10 +182,10 @@ extern char irq_entries_start[];
 #define trace_irq_entries_start irq_entries_start
 #endif
 
-#define VECTOR_UNUSED		(-1)
-#define VECTOR_RETRIGGERED	(-2)
+#define VECTOR_UNUSED		NULL
+#define VECTOR_RETRIGGERED	((void *)~0UL)
 
-typedef int vector_irq_t[NR_VECTORS];
+typedef struct irq_desc* vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
 
 #endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 8008d06581c7..881b4768644a 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,7 +36,9 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
 
 extern void (*x86_platform_ipi_callback)(void);
 extern void native_init_IRQ(void);
-extern bool handle_irq(unsigned irq, struct pt_regs *regs);
+
+struct irq_desc;
+extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);
 
 extern __visible unsigned int do_IRQ(struct pt_regs *regs);
 
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 9a6d11258684..200b5a5d6b79 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -169,7 +169,7 @@ next:
 			goto next;
 
 		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
-			if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNUSED)
+			if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
 				goto next;
 		}
 		/* Found one! */
@@ -181,7 +181,7 @@ next:
 			   cpumask_intersects(d->old_domain, cpu_online_mask);
 		}
 		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
-			per_cpu(vector_irq, new_cpu)[vector] = irq;
+			per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
 		d->cfg.vector = vector;
 		cpumask_copy(d->domain, vector_cpumask);
 		err = 0;
@@ -223,8 +223,9 @@ static int assign_irq_vector_policy(int irq, int node,
 
 static void clear_irq_vector(int irq, struct apic_chip_data *data)
 {
-	int cpu, vector;
+	struct irq_desc *desc;
 	unsigned long flags;
+	int cpu, vector;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
 	BUG_ON(!data->cfg.vector);
@@ -241,10 +242,11 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
 		return;
 	}
 
+	desc = irq_to_desc(irq);
 	for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 		     vector++) {
-			if (per_cpu(vector_irq, cpu)[vector] != irq)
+			if (per_cpu(vector_irq, cpu)[vector] != desc)
 				continue;
 			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 			break;
@@ -402,30 +404,30 @@ int __init arch_early_irq_init(void)
 	return arch_early_ioapic_init();
 }
 
+/* Initialize vector_irq on a new cpu */
 static void __setup_vector_irq(int cpu)
 {
-	/* Initialize vector_irq on a new cpu */
-	int irq, vector;
 	struct apic_chip_data *data;
+	struct irq_desc *desc;
+	int irq, vector;
 
 	/* Mark the inuse vectors */
-	for_each_active_irq(irq) {
-		data = apic_chip_data(irq_get_irq_data(irq));
-		if (!data)
-			continue;
+	for_each_irq_desc(irq, desc) {
+		struct irq_data *idata = irq_desc_get_irq_data(desc);
 
-		if (!cpumask_test_cpu(cpu, data->domain))
+		data = apic_chip_data(idata);
+		if (!data || !cpumask_test_cpu(cpu, data->domain))
 			continue;
 		vector = data->cfg.vector;
-		per_cpu(vector_irq, cpu)[vector] = irq;
+		per_cpu(vector_irq, cpu)[vector] = desc;
 	}
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
-		irq = per_cpu(vector_irq, cpu)[vector];
-		if (irq <= VECTOR_UNUSED)
+		desc = per_cpu(vector_irq, cpu)[vector];
+		if (IS_ERR_OR_NULL(desc))
 			continue;
 
-		data = apic_chip_data(irq_get_irq_data(irq));
+		data = apic_chip_data(irq_desc_get_irq_data(desc));
 		if (!cpumask_test_cpu(cpu, data->domain))
 			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 	}
@@ -447,7 +449,7 @@ void setup_vector_irq(int cpu)
 	 * legacy vector to irq mapping:
 	 */
 	for (irq = 0; irq < nr_legacy_irqs(); irq++)
-		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
+		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
 
 	__setup_vector_irq(cpu);
 }
@@ -543,19 +545,13 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		int irq;
-		unsigned int irr;
-		struct irq_desc *desc;
 		struct apic_chip_data *data;
+		struct irq_desc *desc;
+		unsigned int irr;
 
 	retry:
-		irq = __this_cpu_read(vector_irq[vector]);
-
-		if (irq <= VECTOR_UNUSED)
-			continue;
-
-		desc = irq_to_desc(irq);
-		if (!desc)
+		desc = __this_cpu_read(vector_irq[vector]);
+		if (IS_ERR_OR_NULL(desc))
 			continue;
 
 		if (!raw_spin_trylock(&desc->lock)) {
@@ -565,9 +561,10 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 			goto retry;
 		}
 
-		data = apic_chip_data(&desc->irq_data);
+		data = apic_chip_data(irq_desc_get_irq_data(desc));
 		if (!data)
 			goto unlock;
+
 		/*
 		 * Check if the irq migration is in progress. If so, we
 		 * haven't received the cleanup request yet for this irq.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 140950fb9902..e010847583d7 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -211,22 +211,21 @@ u64 arch_irq_stat(void)
 __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
-
+	struct irq_desc * desc;
 	/* high bit used in ret_from_ code  */
 	unsigned vector = ~regs->orig_ax;
-	unsigned irq;
 
 	entering_irq();
 
-	irq = __this_cpu_read(vector_irq[vector]);
+	desc = __this_cpu_read(vector_irq[vector]);
 
-	if (!handle_irq(irq, regs)) {
+	if (!handle_irq(desc, regs)) {
 		ack_APIC_irq();
 
-		if (irq != VECTOR_RETRIGGERED) {
-			pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
+		if (desc != VECTOR_RETRIGGERED) {
+			pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
 					     __func__, smp_processor_id(),
-					     vector, irq);
+					     vector);
 		} else {
 			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 		}
@@ -330,10 +329,10 @@ static struct cpumask affinity_new, online_new;
  */
 int check_irq_vectors_for_cpu_disable(void)
 {
-	int irq, cpu;
 	unsigned int this_cpu, vector, this_count, count;
 	struct irq_desc *desc;
 	struct irq_data *data;
+	int cpu;
 
 	this_cpu = smp_processor_id();
 	cpumask_copy(&online_new, cpu_online_mask);
@@ -341,24 +340,21 @@ int check_irq_vectors_for_cpu_disable(void)
 
 	this_count = 0;
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		irq = __this_cpu_read(vector_irq[vector]);
-		if (irq < 0)
-			continue;
-		desc = irq_to_desc(irq);
-		if (!desc)
+		desc = __this_cpu_read(vector_irq[vector]);
+		if (IS_ERR_OR_NULL(desc))
 			continue;
-
 		/*
 		 * Protect against concurrent action removal, affinity
 		 * changes etc.
 		 */
 		raw_spin_lock(&desc->lock);
 		data = irq_desc_get_irq_data(desc);
-		cpumask_copy(&affinity_new, irq_data_get_affinity_mask(data));
+		cpumask_copy(&affinity_new,
+			     irq_data_get_affinity_mask(data));
 		cpumask_clear_cpu(this_cpu, &affinity_new);
 
 		/* Do not count inactive or per-cpu irqs. */
-		if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
+		if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) {
 			raw_spin_unlock(&desc->lock);
 			continue;
 		}
@@ -399,8 +395,8 @@ int check_irq_vectors_for_cpu_disable(void)
 		for (vector = FIRST_EXTERNAL_VECTOR;
 		     vector < first_system_vector; vector++) {
 			if (!test_bit(vector, used_vectors) &&
-			    per_cpu(vector_irq, cpu)[vector] <= VECTOR_UNUSED)
-				count++;
+			    IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
+			    count++;
 		}
 	}
 
@@ -504,14 +500,13 @@ void fixup_irqs(void)
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		unsigned int irr;
 
-		if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNUSED)
+		if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
 			continue;
 
 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
 		if (irr  & (1 << (vector % 32))) {
-			irq = __this_cpu_read(vector_irq[vector]);
+			desc = __this_cpu_read(vector_irq[vector]);
 
-			desc = irq_to_desc(irq);
 			raw_spin_lock(&desc->lock);
 			data = irq_desc_get_irq_data(desc);
 			chip = irq_data_get_irq_chip(data);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index cd74f5978ab9..217b01388038 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -148,21 +148,20 @@ void do_softirq_own_stack(void)
 	call_on_stack(__do_softirq, isp);
 }
 
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 {
-	struct irq_desc *desc;
+	unsigned int irq = irq_desc_get_irq(desc);
 	int overflow;
 
 	overflow = check_stack_overflow();
 
-	desc = irq_to_desc(irq);
-	if (unlikely(!desc))
+	if (IS_ERR_OR_NULL(desc))
 		return false;
 
 	if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
 		if (unlikely(overflow))
 			print_stack_overflow();
-		desc->handle_irq(irq, desc);
+		generic_handle_irq_desc(irq, desc);
 	}
 
 	return true;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index bc4604e500a3..ff16ccb918f2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -68,16 +68,13 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 #endif
 }
 
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 {
-	struct irq_desc *desc;
-
 	stack_overflow_check(regs);
 
-	desc = irq_to_desc(irq);
-	if (unlikely(!desc))
+	if (unlikely(IS_ERR_OR_NULL(desc)))
 		return false;
 
-	generic_handle_irq_desc(irq, desc);
+	generic_handle_irq_desc(irq_desc_get_irq(desc), desc);
 	return true;
 }
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 32a637ffdf98..1423ab1b0312 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNUSED)
+		if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
 			return 1;
 	}
 
@@ -94,7 +94,7 @@ void __init init_IRQ(void)
 	 * irq's migrate etc.
 	 */
 	for (i = 0; i < nr_legacy_irqs(); i++)
-		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i;
+		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
 
 	x86_init.irqs.intr_init();
 }
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 2165f45befff..47071a08bfa6 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -843,6 +843,7 @@ static struct irq_chip lguest_irq_controller = {
  */
 static int lguest_setup_irq(unsigned int irq)
 {
+	struct irq_desc *desc;
 	int err;
 
 	/* Returns -ve error or vector number. */
@@ -858,7 +859,8 @@ static int lguest_setup_irq(unsigned int irq)
 				      handle_level_irq, "level");
 
 	/* Some systems map "vectors" to interrupts weirdly.  Not us! */
-	__this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], irq);
+	desc = irq_to_desc(irq);
+	__this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 71db87ba570038497db1227b7dc61113c4156565 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 30 Jul 2015 15:04:01 +0530
Subject: bus: subsys: update return type of ->remove_dev() to void

Its return value is not used by the subsys core and nothing meaningful
can be done with it, even if we want to use it. The subsys device is
anyway getting removed.

Update prototype of ->remove_dev() to make its return type as void. Fix
all usage sites as well.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/sh/kernel/cpu/sh4/sq.c          |  3 +--
 arch/tile/kernel/sysfs.c             | 11 ++++-------
 arch/x86/kernel/cpu/microcode/core.c |  5 ++---
 drivers/cpufreq/cpufreq.c            | 12 +++++-------
 drivers/net/rionet.c                 |  4 +---
 include/linux/device.h               |  2 +-
 6 files changed, 14 insertions(+), 23 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c
index 0a47bd3e7bee..4ca78ed71ad2 100644
--- a/arch/sh/kernel/cpu/sh4/sq.c
+++ b/arch/sh/kernel/cpu/sh4/sq.c
@@ -355,13 +355,12 @@ static int sq_dev_add(struct device *dev, struct subsys_interface *sif)
 	return error;
 }
 
-static int sq_dev_remove(struct device *dev, struct subsys_interface *sif)
+static void sq_dev_remove(struct device *dev, struct subsys_interface *sif)
 {
 	unsigned int cpu = dev->id;
 	struct kobject *kobj = sq_kobject[cpu];
 
 	kobject_put(kobj);
-	return 0;
 }
 
 static struct subsys_interface sq_interface = {
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index a3ed12f8f83b..825867c53853 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -198,16 +198,13 @@ static int hv_stats_device_add(struct device *dev, struct subsys_interface *sif)
 	return err;
 }
 
-static int hv_stats_device_remove(struct device *dev,
-				  struct subsys_interface *sif)
+static void hv_stats_device_remove(struct device *dev,
+				   struct subsys_interface *sif)
 {
 	int cpu = dev->id;
 
-	if (!cpu_online(cpu))
-		return 0;
-
-	sysfs_remove_file(&dev->kobj, &dev_attr_hv_stats.attr);
-	return 0;
+	if (cpu_online(cpu))
+		sysfs_remove_file(&dev->kobj, &dev_attr_hv_stats.attr);
 }
 
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 6236a54a63f4..3c986390058a 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -377,17 +377,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
 	return err;
 }
 
-static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
+static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
 {
 	int cpu = dev->id;
 
 	if (!cpu_online(cpu))
-		return 0;
+		return;
 
 	pr_debug("CPU%d removed\n", cpu);
 	microcode_fini_cpu(cpu);
 	sysfs_remove_group(&dev->kobj, &mc_attr_group);
-	return 0;
 }
 
 static struct subsys_interface mc_cpu_interface = {
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 26063afb3eba..6da25c10bdfd 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1518,7 +1518,7 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
  *
  * Removes the cpufreq interface for a CPU device.
  */
-static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
+static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
 	unsigned int cpu = dev->id;
 	int ret;
@@ -1533,7 +1533,7 @@ static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 		struct cpumask mask;
 
 		if (!policy)
-			return 0;
+			return;
 
 		cpumask_copy(&mask, policy->related_cpus);
 		cpumask_clear_cpu(cpu, &mask);
@@ -1544,19 +1544,17 @@ static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 		 */
 		if (cpumask_intersects(&mask, cpu_present_mask)) {
 			remove_cpu_dev_symlink(policy, cpu);
-			return 0;
+			return;
 		}
 
 		cpufreq_policy_free(policy, true);
-		return 0;
+		return;
 	}
 
 	ret = __cpufreq_remove_dev_prepare(dev, sif);
 
 	if (!ret)
-		ret = __cpufreq_remove_dev_finish(dev, sif);
-
-	return ret;
+		__cpufreq_remove_dev_finish(dev, sif);
 }
 
 static void handle_update(struct work_struct *work)
diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
index dac7a0d9bb46..01f08a7751f7 100644
--- a/drivers/net/rionet.c
+++ b/drivers/net/rionet.c
@@ -396,7 +396,7 @@ static int rionet_close(struct net_device *ndev)
 	return 0;
 }
 
-static int rionet_remove_dev(struct device *dev, struct subsys_interface *sif)
+static void rionet_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
 	struct rio_dev *rdev = to_rio_dev(dev);
 	unsigned char netid = rdev->net->hport->id;
@@ -416,8 +416,6 @@ static int rionet_remove_dev(struct device *dev, struct subsys_interface *sif)
 			}
 		}
 	}
-
-	return 0;
 }
 
 static void rionet_get_drvinfo(struct net_device *ndev,
diff --git a/include/linux/device.h b/include/linux/device.h
index a2b4ea70a946..1225f98e9240 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -341,7 +341,7 @@ struct subsys_interface {
 	struct bus_type *subsys;
 	struct list_head node;
 	int (*add_dev)(struct device *dev, struct subsys_interface *sif);
-	int (*remove_dev)(struct device *dev, struct subsys_interface *sif);
+	void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
 };
 
 int subsys_interface_register(struct subsys_interface *sif);
-- 
cgit v1.2.3-70-g09d2


From 136d9d83c07c5e30ac49fc83b27e8c4842f108fc Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 6 Aug 2015 10:04:38 +0200
Subject: x86/ldt: Correct LDT access in single stepping logic

Commit 37868fe113ff ("x86/ldt: Make modify_ldt synchronous")
introduced a new struct ldt_struct anchored at mm->context.ldt.

convert_ip_to_linear() was changed to reflect this, but indexing
into the ldt has to be changed as the pointer is no longer void *.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Cc: <stable@vger.kernel.org> # On top of: 37868fe113ff: x86/ldt: Make modify_ldt synchronous
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@suse.de
Link: http://lkml.kernel.org/r/1438848278-12906-1-git-send-email-jgross@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/step.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 6273324186ac..0ccb53a9fcd9 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -28,11 +28,11 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		struct desc_struct *desc;
 		unsigned long base;
 
-		seg &= ~7UL;
+		seg >>= 3;
 
 		mutex_lock(&child->mm->context.lock);
 		if (unlikely(!child->mm->context.ldt ||
-			     (seg >> 3) >= child->mm->context.ldt->size))
+			     seg >= child->mm->context.ldt->size))
 			addr = -1L; /* bogus selector, access would fault */
 		else {
 			desc = &child->mm->context.ldt->entries[seg];
-- 
cgit v1.2.3-70-g09d2


From 8eda41b0863ca82010010e951b7aa13b66a06e78 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 18 Jun 2015 16:24:22 +0530
Subject: clockevents/drivers/i8253: Migrate to new 'set-state' interface

Migrate i8253 driver to the new 'set-state' interface provided by
clockevents core, the earlier 'set-mode' interface is marked obsolete
now.

This also enables us to implement callbacks for new states of clockevent
devices, for example: ONESHOT_STOPPED.

Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 arch/x86/kernel/i8253.c     |  2 +-
 drivers/clocksource/i8253.c | 77 ++++++++++++++++++++++-----------------------
 2 files changed, 39 insertions(+), 40 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index f2b96de3c7c1..efb82f07b29c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -34,7 +34,7 @@ static int __init init_pit_clocksource(void)
 	  * - when local APIC timer is active (PIT is switched off)
 	  */
 	if (num_possible_cpus() > 1 || is_hpet_enabled() ||
-	    i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
+	    !clockevent_state_periodic(&i8253_clockevent))
 		return 0;
 
 	return clocksource_i8253_init();
diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c
index 14ee3efcc404..0efd36e483ab 100644
--- a/drivers/clocksource/i8253.c
+++ b/drivers/clocksource/i8253.c
@@ -100,44 +100,40 @@ int __init clocksource_i8253_init(void)
 #endif
 
 #ifdef CONFIG_CLKEVT_I8253
-/*
- * Initialize the PIT timer.
- *
- * This is also called after resume to bring the PIT into operation again.
- */
-static void init_pit_timer(enum clock_event_mode mode,
-			   struct clock_event_device *evt)
+static int pit_shutdown(struct clock_event_device *evt)
 {
+	if (!clockevent_state_oneshot(evt) && !clockevent_state_periodic(evt))
+		return 0;
+
 	raw_spin_lock(&i8253_lock);
 
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		/* binary, mode 2, LSB/MSB, ch 0 */
-		outb_p(0x34, PIT_MODE);
-		outb_p(PIT_LATCH & 0xff , PIT_CH0);	/* LSB */
-		outb_p(PIT_LATCH >> 8 , PIT_CH0);		/* MSB */
-		break;
-
-	case CLOCK_EVT_MODE_SHUTDOWN:
-	case CLOCK_EVT_MODE_UNUSED:
-		if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
-		    evt->mode == CLOCK_EVT_MODE_ONESHOT) {
-			outb_p(0x30, PIT_MODE);
-			outb_p(0, PIT_CH0);
-			outb_p(0, PIT_CH0);
-		}
-		break;
-
-	case CLOCK_EVT_MODE_ONESHOT:
-		/* One shot setup */
-		outb_p(0x38, PIT_MODE);
-		break;
-
-	case CLOCK_EVT_MODE_RESUME:
-		/* Nothing to do here */
-		break;
-	}
+	outb_p(0x30, PIT_MODE);
+	outb_p(0, PIT_CH0);
+	outb_p(0, PIT_CH0);
+
+	raw_spin_unlock(&i8253_lock);
+	return 0;
+}
+
+static int pit_set_oneshot(struct clock_event_device *evt)
+{
+	raw_spin_lock(&i8253_lock);
+	outb_p(0x38, PIT_MODE);
+	raw_spin_unlock(&i8253_lock);
+	return 0;
+}
+
+static int pit_set_periodic(struct clock_event_device *evt)
+{
+	raw_spin_lock(&i8253_lock);
+
+	/* binary, mode 2, LSB/MSB, ch 0 */
+	outb_p(0x34, PIT_MODE);
+	outb_p(PIT_LATCH & 0xff, PIT_CH0);	/* LSB */
+	outb_p(PIT_LATCH >> 8, PIT_CH0);	/* MSB */
+
 	raw_spin_unlock(&i8253_lock);
+	return 0;
 }
 
 /*
@@ -160,10 +156,11 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
  * it can be solely used for the global tick.
  */
 struct clock_event_device i8253_clockevent = {
-	.name		= "pit",
-	.features	= CLOCK_EVT_FEAT_PERIODIC,
-	.set_mode	= init_pit_timer,
-	.set_next_event = pit_next_event,
+	.name			= "pit",
+	.features		= CLOCK_EVT_FEAT_PERIODIC,
+	.set_state_shutdown	= pit_shutdown,
+	.set_state_periodic	= pit_set_periodic,
+	.set_next_event		= pit_next_event,
 };
 
 /*
@@ -172,8 +169,10 @@ struct clock_event_device i8253_clockevent = {
  */
 void __init clockevent_i8253_init(bool oneshot)
 {
-	if (oneshot)
+	if (oneshot) {
 		i8253_clockevent.features |= CLOCK_EVT_FEAT_ONESHOT;
+		i8253_clockevent.set_state_oneshot = pit_set_oneshot;
+	}
 	/*
 	 * Start pit with the boot cpu mask. x86 might make it global
 	 * when it is used as broadcast device later.
-- 
cgit v1.2.3-70-g09d2


From dbc72b7a0c673ff00fdeb21d3a26064e2185baf4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 10 Aug 2015 14:17:34 +0200
Subject: perf/x86/intel: Fix memory leak on hot-plug allocation fail

We fail to free the shared_regs allocation if the constraint_list
allocation fails.

Cure this and be more consistent in NULL-ing the pointers after free.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index b9826a981fb2..6326ae24e4d5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2534,7 +2534,7 @@ static int intel_pmu_cpu_prepare(int cpu)
 	if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
 		cpuc->shared_regs = allocate_shared_regs(cpu);
 		if (!cpuc->shared_regs)
-			return NOTIFY_BAD;
+			goto err;
 	}
 
 	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
@@ -2542,18 +2542,27 @@ static int intel_pmu_cpu_prepare(int cpu)
 
 		cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
 		if (!cpuc->constraint_list)
-			return NOTIFY_BAD;
+			goto err_shared_regs;
 
 		cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
-		if (!cpuc->excl_cntrs) {
-			kfree(cpuc->constraint_list);
-			kfree(cpuc->shared_regs);
-			return NOTIFY_BAD;
-		}
+		if (!cpuc->excl_cntrs)
+			goto err_constraint_list;
+
 		cpuc->excl_thread_id = 0;
 	}
 
 	return NOTIFY_OK;
+
+err_constraint_list:
+	kfree(cpuc->constraint_list);
+	cpuc->constraint_list = NULL;
+
+err_shared_regs:
+	kfree(cpuc->shared_regs);
+	cpuc->shared_regs = NULL;
+
+err:
+	return NOTIFY_BAD;
 }
 
 static void intel_pmu_cpu_starting(int cpu)
-- 
cgit v1.2.3-70-g09d2


From d7a702f0b1033cf402fef65bd6395072738f0844 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Thu, 6 Aug 2015 13:12:43 +0100
Subject: perf/x86/intel/cqm: Do not access cpu_data() from CPU_UP_PREPARE
 handler

Tony reports that booting his 144-cpu machine with maxcpus=10 triggers
the following WARN_ON():

[   21.045727] WARNING: CPU: 8 PID: 647 at arch/x86/kernel/cpu/perf_event_intel_cqm.c:1267 intel_cqm_cpu_prepare+0x75/0x90()
[   21.045744] CPU: 8 PID: 647 Comm: systemd-udevd Not tainted 4.2.0-rc4 #1
[   21.045745] Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BRHSXSD1.86B.0066.R00.1506021730 06/02/2015
[   21.045747]  0000000000000000 0000000082771b09 ffff880856333ba8 ffffffff81669b67
[   21.045748]  0000000000000000 0000000000000000 ffff880856333be8 ffffffff8107b02a
[   21.045750]  ffff88085b789800 ffff88085f68a020 ffffffff819e2470 000000000000000a
[   21.045750] Call Trace:
[   21.045757]  [<ffffffff81669b67>] dump_stack+0x45/0x57
[   21.045759]  [<ffffffff8107b02a>] warn_slowpath_common+0x8a/0xc0
[   21.045761]  [<ffffffff8107b15a>] warn_slowpath_null+0x1a/0x20
[   21.045762]  [<ffffffff81036725>] intel_cqm_cpu_prepare+0x75/0x90
[   21.045764]  [<ffffffff81036872>] intel_cqm_cpu_notifier+0x42/0x160
[   21.045767]  [<ffffffff8109a33d>] notifier_call_chain+0x4d/0x80
[   21.045769]  [<ffffffff8109a44e>] __raw_notifier_call_chain+0xe/0x10
[   21.045770]  [<ffffffff8107b538>] _cpu_up+0xe8/0x190
[   21.045771]  [<ffffffff8107b65a>] cpu_up+0x7a/0xa0
[   21.045774]  [<ffffffff8165e920>] cpu_subsys_online+0x40/0x90
[   21.045777]  [<ffffffff81433b37>] device_online+0x67/0x90
[   21.045778]  [<ffffffff81433bea>] online_store+0x8a/0xa0
[   21.045782]  [<ffffffff81430e78>] dev_attr_store+0x18/0x30
[   21.045785]  [<ffffffff8126b6ba>] sysfs_kf_write+0x3a/0x50
[   21.045786]  [<ffffffff8126ad40>] kernfs_fop_write+0x120/0x170
[   21.045789]  [<ffffffff811f0b77>] __vfs_write+0x37/0x100
[   21.045791]  [<ffffffff811f38b8>] ? __sb_start_write+0x58/0x110
[   21.045795]  [<ffffffff81296d2d>] ? security_file_permission+0x3d/0xc0
[   21.045796]  [<ffffffff811f1279>] vfs_write+0xa9/0x190
[   21.045797]  [<ffffffff811f2075>] SyS_write+0x55/0xc0
[   21.045800]  [<ffffffff81067300>] ? do_page_fault+0x30/0x80
[   21.045804]  [<ffffffff816709ae>] entry_SYSCALL_64_fastpath+0x12/0x71
[   21.045805] ---[ end trace fe228b836d8af405 ]---

The root cause is that CPU_UP_PREPARE is completely the wrong notifier
action from which to access cpu_data(), because smp_store_cpu_info()
won't have been executed by the target CPU at that point, which in turn
means that ->x86_cache_max_rmid and ->x86_cache_occ_scale haven't been
filled out.

Instead let's invoke our handler from CPU_STARTING and rename it
appropriately.

Reported-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikas Shivappa <vikas.shivappa@intel.com>
Link: http://lkml.kernel.org/r/1438863163-14083-1-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 63eb68b73589..377e8f8ed391 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -1255,7 +1255,7 @@ static inline void cqm_pick_event_reader(int cpu)
 	cpumask_set_cpu(cpu, &cqm_cpumask);
 }
 
-static void intel_cqm_cpu_prepare(unsigned int cpu)
+static void intel_cqm_cpu_starting(unsigned int cpu)
 {
 	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -1296,13 +1296,11 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
 	unsigned int cpu  = (unsigned long)hcpu;
 
 	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_UP_PREPARE:
-		intel_cqm_cpu_prepare(cpu);
-		break;
 	case CPU_DOWN_PREPARE:
 		intel_cqm_cpu_exit(cpu);
 		break;
 	case CPU_STARTING:
+		intel_cqm_cpu_starting(cpu);
 		cqm_pick_event_reader(cpu);
 		break;
 	}
@@ -1373,7 +1371,7 @@ static int __init intel_cqm_init(void)
 		goto out;
 
 	for_each_online_cpu(i) {
-		intel_cqm_cpu_prepare(i);
+		intel_cqm_cpu_starting(i);
 		cqm_pick_event_reader(i);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 19b3340cf58d14decf2898fc795cc2b1fa49e79e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 6 Aug 2015 17:26:58 +0200
Subject: perf/x86: Fix MSR PMU driver

Currently we only update the sysfs event files per available MSR, we
didn't actually disallow creating unlisted events.

Rework things such that the dectection, sysfs listing and event
creation are better coordinated.

Sadly it appears it's impossible to probe R/O MSRs under virt. This
means we have to do the full model table to avoid listing all MSRs all
the time.

Tested-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_msr.c | 168 +++++++++++++++++------------------
 1 file changed, 84 insertions(+), 84 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
index af216e9223e8..b0dd2e8a6d12 100644
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -10,17 +10,63 @@ enum perf_msr_id {
 	PERF_MSR_EVENT_MAX,
 };
 
+bool test_aperfmperf(int idx)
+{
+	return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+bool test_intel(int idx)
+{
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+	    boot_cpu_data.x86 != 6)
+		return false;
+
+	switch (boot_cpu_data.x86_model) {
+	case 30: /* 45nm Nehalem    */
+	case 26: /* 45nm Nehalem-EP */
+	case 46: /* 45nm Nehalem-EX */
+
+	case 37: /* 32nm Westmere    */
+	case 44: /* 32nm Westmere-EP */
+	case 47: /* 32nm Westmere-EX */
+
+	case 42: /* 32nm SandyBridge         */
+	case 45: /* 32nm SandyBridge-E/EN/EP */
+
+	case 58: /* 22nm IvyBridge       */
+	case 62: /* 22nm IvyBridge-EP/EX */
+
+	case 60: /* 22nm Haswell Core */
+	case 63: /* 22nm Haswell Server */
+	case 69: /* 22nm Haswell ULT */
+	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+	case 61: /* 14nm Broadwell Core-M */
+	case 86: /* 14nm Broadwell Xeon D */
+	case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+	case 79: /* 14nm Broadwell Server */
+
+	case 55: /* 22nm Atom "Silvermont"                */
+	case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+	case 76: /* 14nm Atom "Airmont"                   */
+		if (idx == PERF_MSR_SMI)
+			return true;
+		break;
+
+	case 78: /* 14nm Skylake Mobile */
+	case 94: /* 14nm Skylake Desktop */
+		if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
+			return true;
+		break;
+	}
+
+	return false;
+}
+
 struct perf_msr {
-	int	id;
 	u64	msr;
-};
-
-static struct perf_msr msr[] = {
-	{ PERF_MSR_TSC, 0 },
-	{ PERF_MSR_APERF, MSR_IA32_APERF },
-	{ PERF_MSR_MPERF, MSR_IA32_MPERF },
-	{ PERF_MSR_PPERF, MSR_PPERF },
-	{ PERF_MSR_SMI, MSR_SMI_COUNT },
+	struct	perf_pmu_events_attr *attr;
+	bool	(*test)(int idx);
 };
 
 PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00");
@@ -29,8 +75,16 @@ PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
 PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
 PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04");
 
+static struct perf_msr msr[] = {
+	[PERF_MSR_TSC]   = { 0,			&evattr_tsc,	NULL,		 },
+	[PERF_MSR_APERF] = { MSR_IA32_APERF,	&evattr_aperf,	test_aperfmperf, },
+	[PERF_MSR_MPERF] = { MSR_IA32_MPERF,	&evattr_mperf,	test_aperfmperf, },
+	[PERF_MSR_PPERF] = { MSR_PPERF,		&evattr_pperf,	test_intel,	 },
+	[PERF_MSR_SMI]   = { MSR_SMI_COUNT,	&evattr_smi,	test_intel,	 },
+};
+
 static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
-	&evattr_tsc.attr.attr,
+	NULL,
 };
 
 static struct attribute_group events_attr_group = {
@@ -74,6 +128,9 @@ static int msr_event_init(struct perf_event *event)
 	    event->attr.sample_period) /* no sampling */
 		return -EINVAL;
 
+	if (!msr[cfg].attr)
+		return -EINVAL;
+
 	event->hw.idx = -1;
 	event->hw.event_base = msr[cfg].msr;
 	event->hw.config = cfg;
@@ -151,89 +208,32 @@ static struct pmu pmu_msr = {
 	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
 };
 
-static int __init intel_msr_init(int idx)
-{
-	if (boot_cpu_data.x86 != 6)
-		return 0;
-
-	switch (boot_cpu_data.x86_model) {
-	case 30: /* 45nm Nehalem    */
-	case 26: /* 45nm Nehalem-EP */
-	case 46: /* 45nm Nehalem-EX */
-
-	case 37: /* 32nm Westmere    */
-	case 44: /* 32nm Westmere-EP */
-	case 47: /* 32nm Westmere-EX */
-
-	case 42: /* 32nm SandyBridge         */
-	case 45: /* 32nm SandyBridge-E/EN/EP */
-
-	case 58: /* 22nm IvyBridge       */
-	case 62: /* 22nm IvyBridge-EP/EX */
-
-	case 60: /* 22nm Haswell Core */
-	case 63: /* 22nm Haswell Server */
-	case 69: /* 22nm Haswell ULT */
-	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-	case 61: /* 14nm Broadwell Core-M */
-	case 86: /* 14nm Broadwell Xeon D */
-	case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-	case 79: /* 14nm Broadwell Server */
-		events_attrs[idx++] = &evattr_smi.attr.attr;
-		break;
-
-	case 78: /* 14nm Skylake Mobile */
-	case 94: /* 14nm Skylake Desktop */
-		events_attrs[idx++] = &evattr_pperf.attr.attr;
-		events_attrs[idx++] = &evattr_smi.attr.attr;
-		break;
-
-	case 55: /* 22nm Atom "Silvermont"                */
-	case 76: /* 14nm Atom "Airmont"                   */
-	case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-		events_attrs[idx++] = &evattr_smi.attr.attr;
-		break;
-	}
-
-	events_attrs[idx] = NULL;
-
-	return 0;
-}
-
-static int __init amd_msr_init(int idx)
-{
-	return 0;
-}
-
 static int __init msr_init(void)
 {
-	int err;
-	int idx = 1;
+	int i, j = 0;
 
-	if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
-		events_attrs[idx++] = &evattr_aperf.attr.attr;
-		events_attrs[idx++] = &evattr_mperf.attr.attr;
-		events_attrs[idx] = NULL;
+	if (!boot_cpu_has(X86_FEATURE_TSC)) {
+		pr_cont("no MSR PMU driver.\n");
+		return 0;
 	}
 
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_INTEL:
-		err = intel_msr_init(idx);
-		break;
-
-	case X86_VENDOR_AMD:
-		err = amd_msr_init(idx);
-		break;
+	/* Probe the MSRs. */
+	for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+		u64 val;
 
-	default:
-		err = -ENOTSUPP;
+		/*
+		 * Virt sucks arse; you cannot tell if a R/O MSR is present :/
+		 */
+		if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+			msr[i].attr = NULL;
 	}
 
-	if (err != 0) {
-		pr_cont("no msr PMU driver.\n");
-		return 0;
+	/* List remaining MSRs in the sysfs attrs. */
+	for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+		if (msr[i].attr)
+			events_attrs[j++] = &msr[i].attr->attr.attr;
 	}
+	events_attrs[j] = NULL;
 
 	perf_pmu_register(&pmu_msr, "msr", -1);
 
-- 
cgit v1.2.3-70-g09d2


From 709bc871923c12b284424f9d47b99dc975ba8b29 Mon Sep 17 00:00:00 2001
From: Takao Indoh <indou.takao@jp.fujitsu.com>
Date: Tue, 4 Aug 2015 18:36:55 +0900
Subject: perf/x86/intel/pt: Clean up files of Intel Processor Trace

This patch just cleans up some files of Intel Processor Trace, does not
change its behavior. This patch removes unused definitions and replaces a
constant value with a macro.

Signed-off-by: Takao Indoh <indou.takao@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin<alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H.Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438681015-5124-1-git-send-email-indou.takao@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/intel_pt.h            | 33 ++++++-------------------------
 arch/x86/kernel/cpu/perf_event_intel_pt.c | 10 +++++-----
 2 files changed, 11 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
index feb293e96531..336878a5d205 100644
--- a/arch/x86/kernel/cpu/intel_pt.h
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -25,32 +25,11 @@
  */
 #define TOPA_PMI_MARGIN 512
 
-/*
- * Table of Physical Addresses bits
- */
-enum topa_sz {
-	TOPA_4K	= 0,
-	TOPA_8K,
-	TOPA_16K,
-	TOPA_32K,
-	TOPA_64K,
-	TOPA_128K,
-	TOPA_256K,
-	TOPA_512K,
-	TOPA_1MB,
-	TOPA_2MB,
-	TOPA_4MB,
-	TOPA_8MB,
-	TOPA_16MB,
-	TOPA_32MB,
-	TOPA_64MB,
-	TOPA_128MB,
-	TOPA_SZ_END,
-};
+#define TOPA_SHIFT 12
 
-static inline unsigned int sizes(enum topa_sz tsz)
+static inline unsigned int sizes(unsigned int tsz)
 {
-	return 1 << (tsz + 12);
+	return 1 << (tsz + TOPA_SHIFT);
 };
 
 struct topa_entry {
@@ -66,8 +45,8 @@ struct topa_entry {
 	u64	rsvd4	: 16;
 };
 
-#define TOPA_SHIFT 12
-#define PT_CPUID_LEAVES 2
+#define PT_CPUID_LEAVES		2
+#define PT_CPUID_REGS_NUM	4 /* number of regsters (eax, ebx, ecx, edx) */
 
 enum pt_capabilities {
 	PT_CAP_max_subleaf = 0,
@@ -85,7 +64,7 @@ enum pt_capabilities {
 
 struct pt_pmu {
 	struct pmu		pmu;
-	u32			caps[4 * PT_CPUID_LEAVES];
+	u32			caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
 };
 
 /**
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index e20cfacb5a32..42169283448b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -79,7 +79,7 @@ static struct pt_cap_desc {
 static u32 pt_cap_get(enum pt_capabilities cap)
 {
 	struct pt_cap_desc *cd = &pt_caps[cap];
-	u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+	u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
 	unsigned int shift = __ffs(cd->mask);
 
 	return (c & cd->mask) >> shift;
@@ -145,10 +145,10 @@ static int __init pt_pmu_hw_init(void)
 
 	for (i = 0; i < PT_CPUID_LEAVES; i++) {
 		cpuid_count(20, i,
-			    &pt_pmu.caps[CR_EAX + i*4],
-			    &pt_pmu.caps[CR_EBX + i*4],
-			    &pt_pmu.caps[CR_ECX + i*4],
-			    &pt_pmu.caps[CR_EDX + i*4]);
+			    &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
+			    &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
+			    &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
+			    &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
 	}
 
 	ret = -ENOMEM;
-- 
cgit v1.2.3-70-g09d2


From 99db44350672c8a5ee9a7b0a6f4cd6ff10136065 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 5 Aug 2015 15:22:27 +0100
Subject: PKCS#7: Appropriately restrict authenticated attributes and content
 type

A PKCS#7 or CMS message can have per-signature authenticated attributes
that are digested as a lump and signed by the authorising key for that
signature.  If such attributes exist, the content digest isn't itself
signed, but rather it is included in a special authattr which then
contributes to the signature.

Further, we already require the master message content type to be
pkcs7_signedData - but there's also a separate content type for the data
itself within the SignedData object and this must be repeated inside the
authattrs for each signer [RFC2315 9.2, RFC5652 11.1].

We should really validate the authattrs if they exist or forbid them
entirely as appropriate.  To this end:

 (1) Alter the PKCS#7 parser to reject any message that has more than one
     signature where at least one signature has authattrs and at least one
     that does not.

 (2) Validate authattrs if they are present and strongly restrict them.
     Only the following authattrs are permitted and all others are
     rejected:

     (a) contentType.  This is checked to be an OID that matches the
     	 content type in the SignedData object.

     (b) messageDigest.  This must match the crypto digest of the data.

     (c) signingTime.  If present, we check that this is a valid, parseable
     	 UTCTime or GeneralTime and that the date it encodes fits within
     	 the validity window of the matching X.509 cert.

     (d) S/MIME capabilities.  We don't check the contents.

     (e) Authenticode SP Opus Info.  We don't check the contents.

     (f) Authenticode Statement Type.  We don't check the contents.

     The message is rejected if (a) or (b) are missing.  If the message is
     an Authenticode type, the message is rejected if (e) is missing; if
     not Authenticode, the message is rejected if (d) - (f) are present.

     The S/MIME capabilities authattr (d) unfortunately has to be allowed
     to support kernels already signed by the pesign program.  This only
     affects kexec.  sign-file suppresses them (CMS_NOSMIMECAP).

     The message is also rejected if an authattr is given more than once or
     if it contains more than one element in its set of values.

 (3) Add a parameter to pkcs7_verify() to select one of the following
     restrictions and pass in the appropriate option from the callers:

     (*) VERIFYING_MODULE_SIGNATURE

	 This requires that the SignedData content type be pkcs7-data and
	 forbids authattrs.  sign-file sets CMS_NOATTR.  We could be more
	 flexible and permit authattrs optionally, but only permit minimal
	 content.

     (*) VERIFYING_FIRMWARE_SIGNATURE

	 This requires that the SignedData content type be pkcs7-data and
	 requires authattrs.  In future, this will require an attribute
	 holding the target firmware name in addition to the minimal set.

     (*) VERIFYING_UNSPECIFIED_SIGNATURE

	 This requires that the SignedData content type be pkcs7-data but
	 allows either no authattrs or only permits the minimal set.

     (*) VERIFYING_KEXEC_PE_SIGNATURE

	 This only supports the Authenticode SPC_INDIRECT_DATA content type
	 and requires at least an SpcSpOpusInfo authattr in addition to the
	 minimal set.  It also permits an SPC_STATEMENT_TYPE authattr (and
	 an S/MIME capabilities authattr because the pesign program doesn't
	 remove these).

     (*) VERIFYING_KEY_SIGNATURE
     (*) VERIFYING_KEY_SELF_SIGNATURE

	 These are invalid in this context but are included for later use
	 when limiting the use of X.509 certs.

 (4) The pkcs7_test key type is given a module parameter to select between
     the above options for testing purposes.  For example:

	echo 1 >/sys/module/pkcs7_test_key/parameters/usage
	keyctl padd pkcs7_test foo @s </tmp/stuff.pkcs7

     will attempt to check the signature on stuff.pkcs7 as if it contains a
     firmware blob (1 being VERIFYING_FIRMWARE_SIGNATURE).

Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marcel Holtmann <marcel@holtmann.org>
Reviewed-by: David Woodhouse <David.Woodhouse@intel.com>
---
 arch/x86/kernel/kexec-bzimage64.c        |   4 +-
 crypto/asymmetric_keys/asymmetric_type.c |  11 +++
 crypto/asymmetric_keys/pkcs7.asn1        |   6 +-
 crypto/asymmetric_keys/pkcs7_key_type.c  |  14 +++-
 crypto/asymmetric_keys/pkcs7_parser.c    | 138 +++++++++++++++++++++++++++++--
 crypto/asymmetric_keys/pkcs7_parser.h    |  15 +++-
 crypto/asymmetric_keys/pkcs7_verify.c    |  65 ++++++++++++++-
 crypto/asymmetric_keys/verify_pefile.c   |   7 +-
 include/crypto/pkcs7.h                   |  10 ++-
 include/crypto/public_key.h              |  14 ++++
 include/keys/system_keyring.h            |   4 +-
 include/linux/oid_registry.h             |   4 +-
 include/linux/verify_pefile.h            |   6 +-
 kernel/module_signing.c                  |   3 +-
 kernel/system_keyring.c                  |   6 +-
 scripts/sign-file.c                      |   5 +-
 16 files changed, 285 insertions(+), 27 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ca83f7ac388b..fab22e72808c 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -536,7 +536,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
 	int ret;
 
 	ret = verify_pefile_signature(kernel, kernel_len,
-				      system_trusted_keyring, &trusted);
+				      system_trusted_keyring,
+				      VERIFYING_KEXEC_PE_SIGNATURE,
+				      &trusted);
 	if (ret < 0)
 		return ret;
 	if (!trusted)
diff --git a/crypto/asymmetric_keys/asymmetric_type.c b/crypto/asymmetric_keys/asymmetric_type.c
index b0e4ed23d668..1916680ad81b 100644
--- a/crypto/asymmetric_keys/asymmetric_type.c
+++ b/crypto/asymmetric_keys/asymmetric_type.c
@@ -12,6 +12,7 @@
  */
 #include <keys/asymmetric-subtype.h>
 #include <keys/asymmetric-parser.h>
+#include <crypto/public_key.h>
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -20,6 +21,16 @@
 
 MODULE_LICENSE("GPL");
 
+const char *const key_being_used_for[NR__KEY_BEING_USED_FOR] = {
+	[VERIFYING_MODULE_SIGNATURE]		= "mod sig",
+	[VERIFYING_FIRMWARE_SIGNATURE]		= "firmware sig",
+	[VERIFYING_KEXEC_PE_SIGNATURE]		= "kexec PE sig",
+	[VERIFYING_KEY_SIGNATURE]		= "key sig",
+	[VERIFYING_KEY_SELF_SIGNATURE]		= "key self sig",
+	[VERIFYING_UNSPECIFIED_SIGNATURE]	= "unspec sig",
+};
+EXPORT_SYMBOL_GPL(key_being_used_for);
+
 static LIST_HEAD(asymmetric_key_parsers);
 static DECLARE_RWSEM(asymmetric_key_parsers_sem);
 
diff --git a/crypto/asymmetric_keys/pkcs7.asn1 b/crypto/asymmetric_keys/pkcs7.asn1
index 6bf8ff4f7414..1eca740b816a 100644
--- a/crypto/asymmetric_keys/pkcs7.asn1
+++ b/crypto/asymmetric_keys/pkcs7.asn1
@@ -8,7 +8,7 @@ ContentType ::= OBJECT IDENTIFIER ({ pkcs7_note_OID })
 SignedData ::= SEQUENCE {
 	version			INTEGER ({ pkcs7_note_signeddata_version }),
 	digestAlgorithms	DigestAlgorithmIdentifiers,
-	contentInfo		ContentInfo,
+	contentInfo		ContentInfo ({ pkcs7_note_content }),
 	certificates		CHOICE {
 		certSet		[0] IMPLICIT ExtendedCertificatesAndCertificates,
 		certSequence	[2] IMPLICIT Certificates
@@ -21,7 +21,7 @@ SignedData ::= SEQUENCE {
 }
 
 ContentInfo ::= SEQUENCE {
-	contentType	ContentType,
+	contentType	ContentType ({ pkcs7_note_OID }),
 	content		[0] EXPLICIT Data OPTIONAL
 }
 
@@ -111,7 +111,7 @@ AuthenticatedAttribute ::= SEQUENCE {
 }
 
 UnauthenticatedAttribute ::= SEQUENCE {
-	type			OBJECT IDENTIFIER ({ pkcs7_note_OID }),
+	type			OBJECT IDENTIFIER,
 	values			SET OF ANY
 }
 
diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c
index 3d13b042da73..10d34dbd00b9 100644
--- a/crypto/asymmetric_keys/pkcs7_key_type.c
+++ b/crypto/asymmetric_keys/pkcs7_key_type.c
@@ -14,16 +14,23 @@
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/key-type.h>
+#include <keys/asymmetric-type.h>
 #include <crypto/pkcs7.h>
 #include <keys/user-type.h>
 #include <keys/system_keyring.h>
 #include "pkcs7_parser.h"
 
+static unsigned pkcs7_usage;
+module_param_named(usage, pkcs7_usage, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(pkcs7_usage,
+		 "Usage to specify when verifying the PKCS#7 message");
+
 /*
  * Preparse a PKCS#7 wrapped and validated data blob.
  */
 static int pkcs7_preparse(struct key_preparsed_payload *prep)
 {
+	enum key_being_used_for usage = pkcs7_usage;
 	struct pkcs7_message *pkcs7;
 	const void *data, *saved_prep_data;
 	size_t datalen, saved_prep_datalen;
@@ -32,6 +39,11 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep)
 
 	kenter("");
 
+	if (usage >= NR__KEY_BEING_USED_FOR) {
+		pr_err("Invalid usage type %d\n", usage);
+		return -EINVAL;
+	}
+
 	saved_prep_data = prep->data;
 	saved_prep_datalen = prep->datalen;
 	pkcs7 = pkcs7_parse_message(saved_prep_data, saved_prep_datalen);
@@ -40,7 +52,7 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep)
 		goto error;
 	}
 
-	ret = pkcs7_verify(pkcs7);
+	ret = pkcs7_verify(pkcs7, usage);
 	if (ret < 0)
 		goto error_free;
 
diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c
index 826e2f3f507b..e6298b7a945a 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.c
+++ b/crypto/asymmetric_keys/pkcs7_parser.c
@@ -81,6 +81,30 @@ void pkcs7_free_message(struct pkcs7_message *pkcs7)
 }
 EXPORT_SYMBOL_GPL(pkcs7_free_message);
 
+/*
+ * Check authenticatedAttributes are provided or not provided consistently.
+ */
+static int pkcs7_check_authattrs(struct pkcs7_message *msg)
+{
+	struct pkcs7_signed_info *sinfo;
+	bool want;
+
+	sinfo = msg->signed_infos;
+	if (sinfo->authattrs) {
+		want = true;
+		msg->have_authattrs = true;
+	}
+
+	for (sinfo = sinfo->next; sinfo; sinfo = sinfo->next)
+		if (!!sinfo->authattrs != want)
+			goto inconsistent;
+	return 0;
+
+inconsistent:
+	pr_warn("Inconsistently supplied authAttrs\n");
+	return -EINVAL;
+}
+
 /**
  * pkcs7_parse_message - Parse a PKCS#7 message
  * @data: The raw binary ASN.1 encoded message to be parsed
@@ -113,6 +137,10 @@ struct pkcs7_message *pkcs7_parse_message(const void *data, size_t datalen)
 		goto out;
 	}
 
+	ret = pkcs7_check_authattrs(ctx->msg);
+	if (ret < 0)
+		goto out;
+
 	msg = ctx->msg;
 	ctx->msg = NULL;
 
@@ -380,6 +408,25 @@ int pkcs7_note_certificate_list(void *context, size_t hdrlen,
 	return 0;
 }
 
+/*
+ * Note the content type.
+ */
+int pkcs7_note_content(void *context, size_t hdrlen,
+		       unsigned char tag,
+		       const void *value, size_t vlen)
+{
+	struct pkcs7_parse_context *ctx = context;
+
+	if (ctx->last_oid != OID_data &&
+	    ctx->last_oid != OID_msIndirectData) {
+		pr_warn("Unsupported data type %d\n", ctx->last_oid);
+		return -EINVAL;
+	}
+
+	ctx->msg->data_type = ctx->last_oid;
+	return 0;
+}
+
 /*
  * Extract the data from the message and store that and its content type OID in
  * the context.
@@ -395,31 +442,90 @@ int pkcs7_note_data(void *context, size_t hdrlen,
 	ctx->msg->data = value;
 	ctx->msg->data_len = vlen;
 	ctx->msg->data_hdrlen = hdrlen;
-	ctx->msg->data_type = ctx->last_oid;
 	return 0;
 }
 
 /*
- * Parse authenticated attributes
+ * Parse authenticated attributes.
  */
 int pkcs7_sig_note_authenticated_attr(void *context, size_t hdrlen,
 				      unsigned char tag,
 				      const void *value, size_t vlen)
 {
 	struct pkcs7_parse_context *ctx = context;
+	struct pkcs7_signed_info *sinfo = ctx->sinfo;
+	enum OID content_type;
 
 	pr_devel("AuthAttr: %02x %zu [%*ph]\n", tag, vlen, (unsigned)vlen, value);
 
 	switch (ctx->last_oid) {
+	case OID_contentType:
+		if (__test_and_set_bit(sinfo_has_content_type, &sinfo->aa_set))
+			goto repeated;
+		content_type = look_up_OID(value, vlen);
+		if (content_type != ctx->msg->data_type) {
+			pr_warn("Mismatch between global data type (%d) and sinfo %u (%d)\n",
+				ctx->msg->data_type, sinfo->index,
+				content_type);
+			return -EBADMSG;
+		}
+		return 0;
+
+	case OID_signingTime:
+		if (__test_and_set_bit(sinfo_has_signing_time, &sinfo->aa_set))
+			goto repeated;
+		/* Should we check that the signing time is consistent
+		 * with the signer's X.509 cert?
+		 */
+		return x509_decode_time(&sinfo->signing_time,
+					hdrlen, tag, value, vlen);
+
 	case OID_messageDigest:
+		if (__test_and_set_bit(sinfo_has_message_digest, &sinfo->aa_set))
+			goto repeated;
 		if (tag != ASN1_OTS)
 			return -EBADMSG;
-		ctx->sinfo->msgdigest = value;
-		ctx->sinfo->msgdigest_len = vlen;
+		sinfo->msgdigest = value;
+		sinfo->msgdigest_len = vlen;
+		return 0;
+
+	case OID_smimeCapabilites:
+		if (__test_and_set_bit(sinfo_has_smime_caps, &sinfo->aa_set))
+			goto repeated;
+		if (ctx->msg->data_type != OID_msIndirectData) {
+			pr_warn("S/MIME Caps only allowed with Authenticode\n");
+			return -EKEYREJECTED;
+		}
+		return 0;
+
+		/* Microsoft SpOpusInfo seems to be contain cont[0] 16-bit BE
+		 * char URLs and cont[1] 8-bit char URLs.
+		 *
+		 * Microsoft StatementType seems to contain a list of OIDs that
+		 * are also used as extendedKeyUsage types in X.509 certs.
+		 */
+	case OID_msSpOpusInfo:
+		if (__test_and_set_bit(sinfo_has_ms_opus_info, &sinfo->aa_set))
+			goto repeated;
+		goto authenticode_check;
+	case OID_msStatementType:
+		if (__test_and_set_bit(sinfo_has_ms_statement_type, &sinfo->aa_set))
+			goto repeated;
+	authenticode_check:
+		if (ctx->msg->data_type != OID_msIndirectData) {
+			pr_warn("Authenticode AuthAttrs only allowed with Authenticode\n");
+			return -EKEYREJECTED;
+		}
+		/* I'm not sure how to validate these */
 		return 0;
 	default:
 		return 0;
 	}
+
+repeated:
+	/* We permit max one item per AuthenticatedAttribute and no repeats */
+	pr_warn("Repeated/multivalue AuthAttrs not permitted\n");
+	return -EKEYREJECTED;
 }
 
 /*
@@ -430,10 +536,25 @@ int pkcs7_sig_note_set_of_authattrs(void *context, size_t hdrlen,
 				    const void *value, size_t vlen)
 {
 	struct pkcs7_parse_context *ctx = context;
+	struct pkcs7_signed_info *sinfo = ctx->sinfo;
+
+	if (!test_bit(sinfo_has_content_type, &sinfo->aa_set) ||
+	    !test_bit(sinfo_has_message_digest, &sinfo->aa_set) ||
+	    (ctx->msg->data_type == OID_msIndirectData &&
+	     !test_bit(sinfo_has_ms_opus_info, &sinfo->aa_set))) {
+		pr_warn("Missing required AuthAttr\n");
+		return -EBADMSG;
+	}
+
+	if (ctx->msg->data_type != OID_msIndirectData &&
+	    test_bit(sinfo_has_ms_opus_info, &sinfo->aa_set)) {
+		pr_warn("Unexpected Authenticode AuthAttr\n");
+		return -EBADMSG;
+	}
 
 	/* We need to switch the 'CONT 0' to a 'SET OF' when we digest */
-	ctx->sinfo->authattrs = value - (hdrlen - 1);
-	ctx->sinfo->authattrs_len = vlen + (hdrlen - 1);
+	sinfo->authattrs = value - (hdrlen - 1);
+	sinfo->authattrs_len = vlen + (hdrlen - 1);
 	return 0;
 }
 
@@ -511,6 +632,11 @@ int pkcs7_note_signed_info(void *context, size_t hdrlen,
 	struct pkcs7_signed_info *sinfo = ctx->sinfo;
 	struct asymmetric_key_id *kid;
 
+	if (ctx->msg->data_type == OID_msIndirectData && !sinfo->authattrs) {
+		pr_warn("Authenticode requires AuthAttrs\n");
+		return -EBADMSG;
+	}
+
 	/* Generate cert issuer + serial number key ID */
 	if (!ctx->expect_skid) {
 		kid = asymmetric_key_generate_id(ctx->raw_serial,
diff --git a/crypto/asymmetric_keys/pkcs7_parser.h b/crypto/asymmetric_keys/pkcs7_parser.h
index 790dd7cec82c..a66b19ebcf47 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.h
+++ b/crypto/asymmetric_keys/pkcs7_parser.h
@@ -21,9 +21,9 @@
 struct pkcs7_signed_info {
 	struct pkcs7_signed_info *next;
 	struct x509_certificate *signer; /* Signing certificate (in msg->certs) */
-	unsigned index;
-	bool trusted;
-	bool unsupported_crypto;	/* T if not usable due to missing crypto */
+	unsigned	index;
+	bool		trusted;
+	bool		unsupported_crypto;	/* T if not usable due to missing crypto */
 
 	/* Message digest - the digest of the Content Data (or NULL) */
 	const void	*msgdigest;
@@ -32,6 +32,14 @@ struct pkcs7_signed_info {
 	/* Authenticated Attribute data (or NULL) */
 	unsigned	authattrs_len;
 	const void	*authattrs;
+	unsigned long	aa_set;
+#define	sinfo_has_content_type		0
+#define	sinfo_has_signing_time		1
+#define	sinfo_has_message_digest	2
+#define sinfo_has_smime_caps		3
+#define	sinfo_has_ms_opus_info		4
+#define	sinfo_has_ms_statement_type	5
+	time64_t	signing_time;
 
 	/* Issuing cert serial number and issuer's name [PKCS#7 or CMS ver 1]
 	 * or issuing cert's SKID [CMS ver 3].
@@ -53,6 +61,7 @@ struct pkcs7_message {
 	struct x509_certificate *crl;	/* Revocation list */
 	struct pkcs7_signed_info *signed_infos;
 	u8		version;	/* Version of cert (1 -> PKCS#7 or CMS; 3 -> CMS) */
+	bool		have_authattrs;	/* T if have authattrs */
 
 	/* Content Data (or NULL) */
 	enum OID	data_type;	/* Type of Data */
diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c
index 404f89a0f852..d20c0b4b880e 100644
--- a/crypto/asymmetric_keys/pkcs7_verify.c
+++ b/crypto/asymmetric_keys/pkcs7_verify.c
@@ -70,9 +70,15 @@ static int pkcs7_digest(struct pkcs7_message *pkcs7,
 	 * message digest attribute amongst them which corresponds to the
 	 * digest we just calculated.
 	 */
-	if (sinfo->msgdigest) {
+	if (sinfo->authattrs) {
 		u8 tag;
 
+		if (!sinfo->msgdigest) {
+			pr_warn("Sig %u: No messageDigest\n", sinfo->index);
+			ret = -EKEYREJECTED;
+			goto error;
+		}
+
 		if (sinfo->msgdigest_len != sinfo->sig.digest_size) {
 			pr_debug("Sig %u: Invalid digest size (%u)\n",
 				 sinfo->index, sinfo->msgdigest_len);
@@ -314,6 +320,18 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
 	pr_devel("Using X.509[%u] for sig %u\n",
 		 sinfo->signer->index, sinfo->index);
 
+	/* Check that the PKCS#7 signing time is valid according to the X.509
+	 * certificate.  We can't, however, check against the system clock
+	 * since that may not have been set yet and may be wrong.
+	 */
+	if (test_bit(sinfo_has_signing_time, &sinfo->aa_set)) {
+		if (sinfo->signing_time < sinfo->signer->valid_from ||
+		    sinfo->signing_time > sinfo->signer->valid_to) {
+			pr_warn("Message signed outside of X.509 validity window\n");
+			return -EKEYREJECTED;
+		}
+	}
+
 	/* Verify the PKCS#7 binary against the key */
 	ret = public_key_verify_signature(sinfo->signer->pub, &sinfo->sig);
 	if (ret < 0)
@@ -328,6 +346,7 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
 /**
  * pkcs7_verify - Verify a PKCS#7 message
  * @pkcs7: The PKCS#7 message to be verified
+ * @usage: The use to which the key is being put
  *
  * Verify a PKCS#7 message is internally consistent - that is, the data digest
  * matches the digest in the AuthAttrs and any signature in the message or one
@@ -339,6 +358,9 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
  *
  * Returns, in order of descending priority:
  *
+ *  (*) -EKEYREJECTED if a key was selected that had a usage restriction at
+ *      odds with the specified usage, or:
+ *
  *  (*) -EKEYREJECTED if a signature failed to match for which we found an
  *	appropriate X.509 certificate, or:
  *
@@ -350,7 +372,8 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
  *  (*) 0 if all the signature chains that don't incur -ENOPKG can be verified
  *	(note that a signature chain may be of zero length), or:
  */
-int pkcs7_verify(struct pkcs7_message *pkcs7)
+int pkcs7_verify(struct pkcs7_message *pkcs7,
+		 enum key_being_used_for usage)
 {
 	struct pkcs7_signed_info *sinfo;
 	struct x509_certificate *x509;
@@ -359,6 +382,44 @@ int pkcs7_verify(struct pkcs7_message *pkcs7)
 
 	kenter("");
 
+	switch (usage) {
+	case VERIFYING_MODULE_SIGNATURE:
+		if (pkcs7->data_type != OID_data) {
+			pr_warn("Invalid module sig (not pkcs7-data)\n");
+			return -EKEYREJECTED;
+		}
+		if (pkcs7->have_authattrs) {
+			pr_warn("Invalid module sig (has authattrs)\n");
+			return -EKEYREJECTED;
+		}
+		break;
+	case VERIFYING_FIRMWARE_SIGNATURE:
+		if (pkcs7->data_type != OID_data) {
+			pr_warn("Invalid firmware sig (not pkcs7-data)\n");
+			return -EKEYREJECTED;
+		}
+		if (!pkcs7->have_authattrs) {
+			pr_warn("Invalid firmware sig (missing authattrs)\n");
+			return -EKEYREJECTED;
+		}
+		break;
+	case VERIFYING_KEXEC_PE_SIGNATURE:
+		if (pkcs7->data_type != OID_msIndirectData) {
+			pr_warn("Invalid kexec sig (not Authenticode)\n");
+			return -EKEYREJECTED;
+		}
+		/* Authattr presence checked in parser */
+		break;
+	case VERIFYING_UNSPECIFIED_SIGNATURE:
+		if (pkcs7->data_type != OID_data) {
+			pr_warn("Invalid unspecified sig (not pkcs7-data)\n");
+			return -EKEYREJECTED;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	for (n = 0, x509 = pkcs7->certs; x509; x509 = x509->next, n++) {
 		ret = x509_get_sig_params(x509);
 		if (ret < 0)
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index 2421f46184ce..897b734dabf9 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -393,6 +393,7 @@ error_no_desc:
  * @pebuf: Buffer containing the PE binary image
  * @pelen: Length of the binary image
  * @trust_keyring: Signing certificates to use as starting points
+ * @usage: The use to which the key is being put.
  * @_trusted: Set to true if trustworth, false otherwise
  *
  * Validate that the certificate chain inside the PKCS#7 message inside the PE
@@ -417,7 +418,9 @@ error_no_desc:
  * May also return -ENOMEM.
  */
 int verify_pefile_signature(const void *pebuf, unsigned pelen,
-			    struct key *trusted_keyring, bool *_trusted)
+			    struct key *trusted_keyring,
+			    enum key_being_used_for usage,
+			    bool *_trusted)
 {
 	struct pkcs7_message *pkcs7;
 	struct pefile_context ctx;
@@ -462,7 +465,7 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
 	if (ret < 0)
 		goto error;
 
-	ret = pkcs7_verify(pkcs7);
+	ret = pkcs7_verify(pkcs7, usage);
 	if (ret < 0)
 		goto error;
 
diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h
index e235ab4957ee..441aff9b5aa7 100644
--- a/include/crypto/pkcs7.h
+++ b/include/crypto/pkcs7.h
@@ -9,6 +9,11 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
+#ifndef _CRYPTO_PKCS7_H
+#define _CRYPTO_PKCS7_H
+
+#include <crypto/public_key.h>
+
 struct key;
 struct pkcs7_message;
 
@@ -33,7 +38,10 @@ extern int pkcs7_validate_trust(struct pkcs7_message *pkcs7,
 /*
  * pkcs7_verify.c
  */
-extern int pkcs7_verify(struct pkcs7_message *pkcs7);
+extern int pkcs7_verify(struct pkcs7_message *pkcs7,
+			enum key_being_used_for usage);
 
 extern int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7,
 				      const void *data, size_t datalen);
+
+#endif /* _CRYPTO_PKCS7_H */
diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index fda097e079a4..067c242b1e15 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -39,6 +39,20 @@ enum pkey_id_type {
 
 extern const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST];
 
+/*
+ * The use to which an asymmetric key is being put.
+ */
+enum key_being_used_for {
+	VERIFYING_MODULE_SIGNATURE,
+	VERIFYING_FIRMWARE_SIGNATURE,
+	VERIFYING_KEXEC_PE_SIGNATURE,
+	VERIFYING_KEY_SIGNATURE,
+	VERIFYING_KEY_SELF_SIGNATURE,
+	VERIFYING_UNSPECIFIED_SIGNATURE,
+	NR__KEY_BEING_USED_FOR
+};
+extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR];
+
 /*
  * Cryptographic data for the public-key subtype of the asymmetric key type.
  *
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
index 9791c907cdb7..b20cd885c1fd 100644
--- a/include/keys/system_keyring.h
+++ b/include/keys/system_keyring.h
@@ -15,6 +15,7 @@
 #ifdef CONFIG_SYSTEM_TRUSTED_KEYRING
 
 #include <linux/key.h>
+#include <crypto/public_key.h>
 
 extern struct key *system_trusted_keyring;
 static inline struct key *get_system_trusted_keyring(void)
@@ -30,7 +31,8 @@ static inline struct key *get_system_trusted_keyring(void)
 
 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
 extern int system_verify_data(const void *data, unsigned long len,
-			      const void *raw_pkcs7, size_t pkcs7_len);
+			      const void *raw_pkcs7, size_t pkcs7_len,
+			      enum key_being_used_for usage);
 #endif
 
 #endif /* _KEYS_SYSTEM_KEYRING_H */
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index c2bbf672b84e..93e0ff92fb9b 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -41,7 +41,7 @@ enum OID {
 	OID_signed_data,		/* 1.2.840.113549.1.7.2 */
 	/* PKCS#9 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-9(9)} */
 	OID_email_address,		/* 1.2.840.113549.1.9.1 */
-	OID_content_type,		/* 1.2.840.113549.1.9.3 */
+	OID_contentType,		/* 1.2.840.113549.1.9.3 */
 	OID_messageDigest,		/* 1.2.840.113549.1.9.4 */
 	OID_signingTime,		/* 1.2.840.113549.1.9.5 */
 	OID_smimeCapabilites,		/* 1.2.840.113549.1.9.15 */
@@ -54,6 +54,8 @@ enum OID {
 
 	/* Microsoft Authenticode & Software Publishing */
 	OID_msIndirectData,		/* 1.3.6.1.4.1.311.2.1.4 */
+	OID_msStatementType,		/* 1.3.6.1.4.1.311.2.1.11 */
+	OID_msSpOpusInfo,		/* 1.3.6.1.4.1.311.2.1.12 */
 	OID_msPeImageDataObjId,		/* 1.3.6.1.4.1.311.2.1.15 */
 	OID_msIndividualSPKeyPurpose,	/* 1.3.6.1.4.1.311.2.1.21 */
 	OID_msOutlookExpress,		/* 1.3.6.1.4.1.311.16.4 */
diff --git a/include/linux/verify_pefile.h b/include/linux/verify_pefile.h
index ac34819214f9..da2049b5161c 100644
--- a/include/linux/verify_pefile.h
+++ b/include/linux/verify_pefile.h
@@ -12,7 +12,11 @@
 #ifndef _LINUX_VERIFY_PEFILE_H
 #define _LINUX_VERIFY_PEFILE_H
 
+#include <crypto/public_key.h>
+
 extern int verify_pefile_signature(const void *pebuf, unsigned pelen,
-				   struct key *trusted_keyring, bool *_trusted);
+				   struct key *trusted_keyring,
+				   enum key_being_used_for usage,
+				   bool *_trusted);
 
 #endif /* _LINUX_VERIFY_PEFILE_H */
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 70ad463f6df0..bd62f5cda746 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -72,5 +72,6 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 		return -EBADMSG;
 	}
 
-	return system_verify_data(mod, modlen, mod + modlen, sig_len);
+	return system_verify_data(mod, modlen, mod + modlen, sig_len,
+				  VERIFYING_MODULE_SIGNATURE);
 }
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 95f2dcbc7616..2570598b784d 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -113,9 +113,11 @@ late_initcall(load_system_certificate_list);
  * @len: Size of @data.
  * @raw_pkcs7: The PKCS#7 message that is the signature.
  * @pkcs7_len: The size of @raw_pkcs7.
+ * @usage: The use to which the key is being put.
  */
 int system_verify_data(const void *data, unsigned long len,
-		       const void *raw_pkcs7, size_t pkcs7_len)
+		       const void *raw_pkcs7, size_t pkcs7_len,
+		       enum key_being_used_for usage)
 {
 	struct pkcs7_message *pkcs7;
 	bool trusted;
@@ -132,7 +134,7 @@ int system_verify_data(const void *data, unsigned long len,
 		goto error;
 	}
 
-	ret = pkcs7_verify(pkcs7);
+	ret = pkcs7_verify(pkcs7, usage);
 	if (ret < 0)
 		goto error;
 
diff --git a/scripts/sign-file.c b/scripts/sign-file.c
index de213e5c0cd3..e9741e879bbd 100755
--- a/scripts/sign-file.c
+++ b/scripts/sign-file.c
@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 	bool sign_only = false;
 	unsigned char buf[4096];
 	unsigned long module_size, cms_size;
-	unsigned int use_keyid = 0;
+	unsigned int use_keyid = 0, use_signed_attrs = CMS_NOATTR;
 	const EVP_MD *digest_algo;
 	EVP_PKEY *private_key;
 	CMS_ContentInfo *cms;
@@ -216,7 +216,8 @@ int main(int argc, char **argv)
 	ERR(!cms, "CMS_sign");
 
 	ERR(!CMS_add1_signer(cms, x509, private_key, digest_algo,
-			     CMS_NOCERTS | CMS_BINARY | CMS_NOSMIMECAP | use_keyid),
+			     CMS_NOCERTS | CMS_BINARY | CMS_NOSMIMECAP |
+			     use_keyid | use_signed_attrs),
 	    "CMS_sign_add_signer");
 	ERR(CMS_final(cms, bm, NULL, CMS_NOCERTS | CMS_BINARY) < 0,
 	    "CMS_final");
-- 
cgit v1.2.3-70-g09d2


From 648ed94038c030245a06e4be59744fd5cdc18c40 Mon Sep 17 00:00:00 2001
From: "Chen, Gong" <gong.chen@linux.intel.com>
Date: Wed, 12 Aug 2015 18:29:34 +0200
Subject: x86/mce: Provide a lockless memory pool to save error records

printk() is not safe to use in MCE context. Add a lockless
memory allocator pool to save error records in MCE context.
Those records will be issued later, in a printk-safe context.
The idea is inspired by the APEI/GHES driver.

We're very conservative and allocate only two pages for it but
since we're going to use those pages throughout the system's
lifetime, we allocate them statically to avoid early boot time
allocation woes.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
[ Rewrite. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1439396985-12812-3-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig                          |  1 +
 arch/x86/kernel/cpu/mcheck/Makefile       |  2 +-
 arch/x86/kernel/cpu/mcheck/mce-genpool.c  | 99 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce-internal.h | 12 ++++
 arch/x86/kernel/cpu/mcheck/mce.c          |  8 ++-
 5 files changed, 120 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/mcheck/mce-genpool.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b3a1a5d77d92..06dbb5da90c6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -955,6 +955,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 
 config X86_MCE
 	bool "Machine Check / overheating reporting"
+	select GENERIC_ALLOCATOR
 	default y
 	---help---
 	  Machine Check support allows the processor to notify the
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index bb34b03af252..a3311c886194 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,4 +1,4 @@
-obj-y				=  mce.o mce-severity.o
+obj-y				=  mce.o mce-severity.o mce-genpool.o
 
 obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
new file mode 100644
index 000000000000..0a850100c594
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -0,0 +1,99 @@
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "mce-internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough. Use
+ * 2 pages to save MCE events for now (~80 MCE records at most).
+ */
+#define MCE_POOLSZ	(2 * PAGE_SIZE)
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+static char gen_pool_buf[MCE_POOLSZ];
+
+void mce_gen_pool_process(void)
+{
+	struct llist_node *head;
+	struct mce_evt_llist *node;
+	struct mce *mce;
+
+	head = llist_del_all(&mce_event_llist);
+	if (!head)
+		return;
+
+	head = llist_reverse_order(head);
+	llist_for_each_entry(node, head, llnode) {
+		mce = &node->mce;
+		atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+		gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+	}
+}
+
+bool mce_gen_pool_empty(void)
+{
+	return llist_empty(&mce_event_llist);
+}
+
+int mce_gen_pool_add(struct mce *mce)
+{
+	struct mce_evt_llist *node;
+
+	if (!mce_evt_pool)
+		return -EINVAL;
+
+	node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
+	if (!node) {
+		pr_warn_ratelimited("MCE records pool full!\n");
+		return -ENOMEM;
+	}
+
+	memcpy(&node->mce, mce, sizeof(*mce));
+	llist_add(&node->llnode, &mce_event_llist);
+
+	return 0;
+}
+
+static int mce_gen_pool_create(void)
+{
+	struct gen_pool *tmpp;
+	int ret = -ENOMEM;
+
+	tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
+	if (!tmpp)
+		goto out;
+
+	ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+	if (ret) {
+		gen_pool_destroy(tmpp);
+		goto out;
+	}
+
+	mce_evt_pool = tmpp;
+
+out:
+	return ret;
+}
+
+int mce_gen_pool_init(void)
+{
+	/* Just init mce_gen_pool once. */
+	if (mce_evt_pool)
+		return 0;
+
+	return mce_gen_pool_create();
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fe32074b865b..ea8b62264c14 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,6 +13,8 @@ enum severity_level {
 	MCE_PANIC_SEVERITY,
 };
 
+extern struct atomic_notifier_head x86_mce_decoder_chain;
+
 #define ATTR_LEN		16
 #define INITIAL_CHECK_INTERVAL	5 * 60 /* 5 minutes */
 
@@ -24,6 +26,16 @@ struct mce_bank {
 	char			attrname[ATTR_LEN];	/* attribute name */
 };
 
+struct mce_evt_llist {
+	struct llist_node llnode;
+	struct mce mce;
+};
+
+void mce_gen_pool_process(void);
+bool mce_gen_pool_empty(void);
+int mce_gen_pool_add(struct mce *mce);
+int mce_gen_pool_init(void);
+
 extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
 struct dentry *mce_get_debugfs_dir(void);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index df919ff103c3..a41c014e5cde 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -118,7 +118,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
  */
-static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
@@ -1731,6 +1731,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 		return;
 	}
 
+	if (mce_gen_pool_init()) {
+		mca_cfg.disabled = true;
+		pr_emerg("Couldn't allocate MCE records pool!\n");
+		return;
+	}
+
 	machine_check_vector = do_machine_check;
 
 	__mcheck_cpu_init_generic();
-- 
cgit v1.2.3-70-g09d2


From 061120aed7081b9a4393fbe07b558192f40ad911 Mon Sep 17 00:00:00 2001
From: "Chen, Gong" <gong.chen@linux.intel.com>
Date: Wed, 12 Aug 2015 18:29:35 +0200
Subject: x86/mce: Don't use percpu workqueues

An MCE is a rare event. Therefore, there's no need to have
per-CPU instances of both normal and IRQ workqueues. Make them
both global.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
[ Fold in subsequent patch from Rui/Boris/Tony for early boot logging. ]
Signed-off-by: Tony Luck <tony.luck@intel.com>
[ Massage commit message. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1439396985-12812-4-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a41c014e5cde..456f8d7b8fd3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -110,7 +110,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  */
 mce_banks_t mce_banks_ce_disabled;
 
-static DEFINE_PER_CPU(struct work_struct, mce_work);
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;
 
 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 
@@ -526,11 +527,9 @@ int mce_available(struct cpuinfo_x86 *c)
 static void mce_schedule_work(void)
 {
 	if (!mce_ring_empty())
-		schedule_work(this_cpu_ptr(&mce_work));
+		schedule_work(&mce_work);
 }
 
-static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
-
 static void mce_irq_work_cb(struct irq_work *entry)
 {
 	mce_notify_irq();
@@ -551,7 +550,7 @@ static void mce_report_event(struct pt_regs *regs)
 		return;
 	}
 
-	irq_work_queue(this_cpu_ptr(&mce_irq_work));
+	irq_work_queue(&mce_irq_work);
 }
 
 /*
@@ -1742,8 +1741,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_timer();
-	INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
-	init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
 }
 
 /*
@@ -2064,6 +2061,9 @@ int __init mcheck_init(void)
 	mcheck_intel_therm_init();
 	mcheck_vendor_init_severity();
 
+	INIT_WORK(&mce_work, mce_process_work);
+	init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From fd4cf79fcc4b5130ced8fd8c40378d3cec2e5fa8 Mon Sep 17 00:00:00 2001
From: "Chen, Gong" <gong.chen@linux.intel.com>
Date: Wed, 12 Aug 2015 18:29:36 +0200
Subject: x86/mce: Remove the MCE ring for Action Optional errors

Use unified genpool to save Action Optional error events and put
Action Optional error handling in the same notification chain as
MCE error decoding.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
[ Fold in subsequent patch from Boris for early boot logging. ]
Signed-off-by: Tony Luck <tony.luck@intel.com>
[ Correct a lot. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1439396985-12812-5-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mce.h       |   2 +-
 arch/x86/kernel/cpu/mcheck/mce.c | 135 +++++++++++++++++----------------------
 drivers/acpi/acpi_extlog.c       |   2 +-
 drivers/edac/i7core_edac.c       |   2 +-
 drivers/edac/mce_amd.c           |   2 +-
 drivers/edac/sb_edac.c           |   2 +-
 6 files changed, 65 insertions(+), 80 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 982dfc3679ad..dfaa4de1dbb4 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -140,7 +140,7 @@ struct mce_vendor_flags {
 extern struct mce_vendor_flags mce_flags;
 
 extern struct mca_config mca_cfg;
-extern void mce_register_decode_chain(struct notifier_block *nb);
+extern void mce_register_decode_chain(struct notifier_block *nb, bool drain);
 extern void mce_unregister_decode_chain(struct notifier_block *nb);
 
 #include <linux/percpu.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 456f8d7b8fd3..82603690b65c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -114,6 +114,7 @@ static struct work_struct mce_work;
 static struct irq_work mce_irq_work;
 
 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+static int mce_usable_address(struct mce *m);
 
 /*
  * CPU/chipset specific EDAC code can register a notifier call here to print
@@ -234,11 +235,18 @@ static void drain_mcelog_buffer(void)
 	} while (next != prev);
 }
 
+static struct notifier_block mce_srao_nb;
 
-void mce_register_decode_chain(struct notifier_block *nb)
+void mce_register_decode_chain(struct notifier_block *nb, bool drain)
 {
+	/* Ensure SRAO notifier has the highest priority in the decode chain. */
+	if (nb != &mce_srao_nb && nb->priority == INT_MAX)
+		nb->priority -= 1;
+
 	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
-	drain_mcelog_buffer();
+
+	if (drain)
+		drain_mcelog_buffer();
 }
 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 
@@ -462,61 +470,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 	}
 }
 
-/*
- * Simple lockless ring to communicate PFNs from the exception handler with the
- * process context work function. This is vastly simplified because there's
- * only a single reader and a single writer.
- */
-#define MCE_RING_SIZE 16	/* we use one entry less */
-
-struct mce_ring {
-	unsigned short start;
-	unsigned short end;
-	unsigned long ring[MCE_RING_SIZE];
-};
-static DEFINE_PER_CPU(struct mce_ring, mce_ring);
-
-/* Runs with CPU affinity in workqueue */
-static int mce_ring_empty(void)
-{
-	struct mce_ring *r = this_cpu_ptr(&mce_ring);
-
-	return r->start == r->end;
-}
-
-static int mce_ring_get(unsigned long *pfn)
-{
-	struct mce_ring *r;
-	int ret = 0;
-
-	*pfn = 0;
-	get_cpu();
-	r = this_cpu_ptr(&mce_ring);
-	if (r->start == r->end)
-		goto out;
-	*pfn = r->ring[r->start];
-	r->start = (r->start + 1) % MCE_RING_SIZE;
-	ret = 1;
-out:
-	put_cpu();
-	return ret;
-}
-
-/* Always runs in MCE context with preempt off */
-static int mce_ring_add(unsigned long pfn)
-{
-	struct mce_ring *r = this_cpu_ptr(&mce_ring);
-	unsigned next;
-
-	next = (r->end + 1) % MCE_RING_SIZE;
-	if (next == r->start)
-		return -1;
-	r->ring[r->end] = pfn;
-	wmb();
-	r->end = next;
-	return 0;
-}
-
 int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mca_cfg.disabled)
@@ -526,7 +479,7 @@ int mce_available(struct cpuinfo_x86 *c)
 
 static void mce_schedule_work(void)
 {
-	if (!mce_ring_empty())
+	if (!mce_gen_pool_empty() && keventd_up())
 		schedule_work(&mce_work);
 }
 
@@ -553,6 +506,27 @@ static void mce_report_event(struct pt_regs *regs)
 	irq_work_queue(&mce_irq_work);
 }
 
+static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct mce *mce = (struct mce *)data;
+	unsigned long pfn;
+
+	if (!mce)
+		return NOTIFY_DONE;
+
+	if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
+		pfn = mce->addr >> PAGE_SHIFT;
+		memory_failure(pfn, MCE_VECTOR, 0);
+	}
+
+	return NOTIFY_OK;
+}
+static struct notifier_block mce_srao_nb = {
+	.notifier_call	= srao_decode_notifier,
+	.priority = INT_MAX,
+};
+
 /*
  * Read ADDR and MISC registers.
  */
@@ -671,8 +645,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 */
 		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
 			if (m.status & MCI_STATUS_ADDRV) {
-				mce_ring_add(m.addr >> PAGE_SHIFT);
-				mce_schedule_work();
+				m.severity = severity;
+				m.usable_addr = mce_usable_address(&m);
+
+				if (!mce_gen_pool_add(&m))
+					mce_schedule_work();
 			}
 		}
 
@@ -1142,15 +1119,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 		mce_read_aux(&m, i);
 
-		/*
-		 * Action optional error. Queue address for later processing.
-		 * When the ring overflows we just ignore the AO error.
-		 * RED-PEN add some logging mechanism when
-		 * usable_address or mce_add_ring fails.
-		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
-		 */
-		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
-			mce_ring_add(m.addr >> PAGE_SHIFT);
+		/* assuming valid severity level != 0 */
+		m.severity = severity;
+		m.usable_addr = mce_usable_address(&m);
+		mce_gen_pool_add(&m);
 
 		mce_log(&m);
 
@@ -1246,14 +1218,11 @@ int memory_failure(unsigned long pfn, int vector, int flags)
 /*
  * Action optional processing happens here (picking up
  * from the list of faulting pages that do_machine_check()
- * placed into the "ring").
+ * placed into the genpool).
  */
 static void mce_process_work(struct work_struct *dummy)
 {
-	unsigned long pfn;
-
-	while (mce_ring_get(&pfn))
-		memory_failure(pfn, MCE_VECTOR, 0);
+	mce_gen_pool_process();
 }
 
 #ifdef CONFIG_X86_MCE_INTEL
@@ -2059,6 +2028,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
 	mcheck_intel_therm_init();
+	mce_register_decode_chain(&mce_srao_nb, false);
 	mcheck_vendor_init_severity();
 
 	INIT_WORK(&mce_work, mce_process_work);
@@ -2597,5 +2567,20 @@ static int __init mcheck_debugfs_init(void)
 
 	return 0;
 }
-late_initcall(mcheck_debugfs_init);
+#else
+static int __init mcheck_debugfs_init(void) { return -EINVAL; }
 #endif
+
+static int __init mcheck_late_init(void)
+{
+	mcheck_debugfs_init();
+
+	/*
+	 * Flush out everything that has been logged during early boot, now that
+	 * everything has been initialized (workqueues, decoders, ...).
+	 */
+	mce_schedule_work();
+
+	return 0;
+}
+late_initcall(mcheck_late_init);
diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index b3842ffc19ba..07e012e74c1b 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -286,7 +286,7 @@ static int __init extlog_init(void)
 	 */
 	old_edac_report_status = get_edac_report_status();
 	set_edac_report_status(EDAC_REPORTING_DISABLED);
-	mce_register_decode_chain(&extlog_mce_dec);
+	mce_register_decode_chain(&extlog_mce_dec, true);
 	/* enable OS to be involved to take over management from BIOS */
 	((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
 
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 01087a38da22..13d77f4a892c 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -2424,7 +2424,7 @@ static int __init i7core_init(void)
 	pci_rc = pci_register_driver(&i7core_driver);
 
 	if (pci_rc >= 0) {
-		mce_register_decode_chain(&i7_mce_dec);
+		mce_register_decode_chain(&i7_mce_dec, true);
 		return 0;
 	}
 
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 58586d59bf8e..aca31a237073 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -895,7 +895,7 @@ static int __init mce_amd_init(void)
 
 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
 
-	mce_register_decode_chain(&amd_mce_dec_nb);
+	mce_register_decode_chain(&amd_mce_dec_nb, true);
 
 	return 0;
 }
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index ca7831168298..5780e26c3e58 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -2591,7 +2591,7 @@ static int __init sbridge_init(void)
 
 	pci_rc = pci_register_driver(&sbridge_driver);
 	if (pci_rc >= 0) {
-		mce_register_decode_chain(&sbridge_mce_dec);
+		mce_register_decode_chain(&sbridge_mce_dec, true);
 		if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
 			sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n");
 		return 0;
-- 
cgit v1.2.3-70-g09d2


From f29a7aff4bd60ebc3da4982f80144a4158c4c74a Mon Sep 17 00:00:00 2001
From: "Chen, Gong" <gong.chen@linux.intel.com>
Date: Wed, 12 Aug 2015 18:29:37 +0200
Subject: x86/mce: Avoid potential deadlock due to printk() in MCE context

Printing in MCE context is a no-no, currently, as printk() is
not NMI-safe. If some of the notifiers on the MCE chain call do
so, we may deadlock. In order to avoid that, delay printk() to
process context where it is safe.

Reported-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
[ Fold in subsequent patch from Boris for early boot logging. ]
Signed-off-by: Tony Luck <tony.luck@intel.com>
[ Kick irq_work in mce_log() directly. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1439396985-12812-6-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c  | 1 -
 arch/x86/kernel/cpu/mcheck/mce.c       | 4 ++--
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 -
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index a1aef9533154..34c89a3e8260 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
 
 	m.addr = mem_err->physical_addr;
 	mce_log(&m);
-	mce_notify_irq();
 }
 EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 82603690b65c..9568bb55bfe2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -159,7 +159,8 @@ void mce_log(struct mce *mce)
 	/* Emit the trace record: */
 	trace_mce_record(mce);
 
-	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+	if (!mce_gen_pool_add(mce))
+		irq_work_queue(&mce_irq_work);
 
 	mce->finished = 0;
 	wmb();
@@ -1122,7 +1123,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		/* assuming valid severity level != 0 */
 		m.severity = severity;
 		m.usable_addr = mce_usable_address(&m);
-		mce_gen_pool_add(&m);
 
 		mce_log(&m);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 844f56c5616d..70f567f774ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -246,7 +246,6 @@ static void intel_threshold_interrupt(void)
 		return;
 
 	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
-	mce_notify_irq();
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From eef4dfa0cb83899c782935ac5345532f47073cea Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 12 Aug 2015 18:29:38 +0200
Subject: x86/mce: Kill drain_mcelog_buffer()

This used to flush out MCEs logged during early boot and which
were in the MCA registers from a previous system run. No need
for that now, since we've moved to a genpool.

Suggested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1439396985-12812-7-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mce.h       |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c | 44 ++--------------------------------------
 drivers/acpi/acpi_extlog.c       |  2 +-
 drivers/edac/i7core_edac.c       |  2 +-
 drivers/edac/mce_amd.c           |  2 +-
 drivers/edac/sb_edac.c           |  2 +-
 6 files changed, 7 insertions(+), 47 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index dfaa4de1dbb4..982dfc3679ad 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -140,7 +140,7 @@ struct mce_vendor_flags {
 extern struct mce_vendor_flags mce_flags;
 
 extern struct mca_config mca_cfg;
-extern void mce_register_decode_chain(struct notifier_block *nb, bool drain);
+extern void mce_register_decode_chain(struct notifier_block *nb);
 extern void mce_unregister_decode_chain(struct notifier_block *nb);
 
 #include <linux/percpu.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9568bb55bfe2..32b586ee006a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -199,55 +199,15 @@ void mce_log(struct mce *mce)
 	set_bit(0, &mce_need_notify);
 }
 
-static void drain_mcelog_buffer(void)
-{
-	unsigned int next, i, prev = 0;
-
-	next = ACCESS_ONCE(mcelog.next);
-
-	do {
-		struct mce *m;
-
-		/* drain what was logged during boot */
-		for (i = prev; i < next; i++) {
-			unsigned long start = jiffies;
-			unsigned retries = 1;
-
-			m = &mcelog.entry[i];
-
-			while (!m->finished) {
-				if (time_after_eq(jiffies, start + 2*retries))
-					retries++;
-
-				cpu_relax();
-
-				if (!m->finished && retries >= 4) {
-					pr_err("skipping error being logged currently!\n");
-					break;
-				}
-			}
-			smp_rmb();
-			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
-		}
-
-		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
-		prev = next;
-		next = cmpxchg(&mcelog.next, prev, 0);
-	} while (next != prev);
-}
-
 static struct notifier_block mce_srao_nb;
 
-void mce_register_decode_chain(struct notifier_block *nb, bool drain)
+void mce_register_decode_chain(struct notifier_block *nb)
 {
 	/* Ensure SRAO notifier has the highest priority in the decode chain. */
 	if (nb != &mce_srao_nb && nb->priority == INT_MAX)
 		nb->priority -= 1;
 
 	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
-
-	if (drain)
-		drain_mcelog_buffer();
 }
 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 
@@ -2028,7 +1988,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
 	mcheck_intel_therm_init();
-	mce_register_decode_chain(&mce_srao_nb, false);
+	mce_register_decode_chain(&mce_srao_nb);
 	mcheck_vendor_init_severity();
 
 	INIT_WORK(&mce_work, mce_process_work);
diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index 07e012e74c1b..b3842ffc19ba 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -286,7 +286,7 @@ static int __init extlog_init(void)
 	 */
 	old_edac_report_status = get_edac_report_status();
 	set_edac_report_status(EDAC_REPORTING_DISABLED);
-	mce_register_decode_chain(&extlog_mce_dec, true);
+	mce_register_decode_chain(&extlog_mce_dec);
 	/* enable OS to be involved to take over management from BIOS */
 	((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
 
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 13d77f4a892c..01087a38da22 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -2424,7 +2424,7 @@ static int __init i7core_init(void)
 	pci_rc = pci_register_driver(&i7core_driver);
 
 	if (pci_rc >= 0) {
-		mce_register_decode_chain(&i7_mce_dec, true);
+		mce_register_decode_chain(&i7_mce_dec);
 		return 0;
 	}
 
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index aca31a237073..58586d59bf8e 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -895,7 +895,7 @@ static int __init mce_amd_init(void)
 
 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
 
-	mce_register_decode_chain(&amd_mce_dec_nb, true);
+	mce_register_decode_chain(&amd_mce_dec_nb);
 
 	return 0;
 }
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 5780e26c3e58..ca7831168298 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -2591,7 +2591,7 @@ static int __init sbridge_init(void)
 
 	pci_rc = pci_register_driver(&sbridge_driver);
 	if (pci_rc >= 0) {
-		mce_register_decode_chain(&sbridge_mce_dec, true);
+		mce_register_decode_chain(&sbridge_mce_dec);
 		if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
 			sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n");
 		return 0;
-- 
cgit v1.2.3-70-g09d2


From 8838eb6c0bf3b6a6494a163947ab3d1700ab45d2 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Wed, 12 Aug 2015 18:29:40 +0200
Subject: x86/mce: Clear Local MCE opt-in before kexec

kexec could boot a kernel that could be legacy with no knowledge
of LMCE. Hence we should make sure we clear LMCE optin before
kexec reboot.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1439396985-12812-9-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mce.h             |  4 ++++
 arch/x86/kernel/cpu/mcheck/mce.c       | 30 ++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 19 ++++++++++++++++++-
 arch/x86/kernel/process.c              |  2 ++
 arch/x86/kernel/smp.c                  |  2 ++
 5 files changed, 56 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 38d3a1a8830f..2dbc0bf2b9f3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -151,10 +151,12 @@ extern int mce_p5_enabled;
 #ifdef CONFIG_X86_MCE
 int mcheck_init(void);
 void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_cpu_clear(struct cpuinfo_x86 *c);
 void mcheck_vendor_init_severity(void);
 #else
 static inline int mcheck_init(void) { return 0; }
 static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
 static inline void mcheck_vendor_init_severity(void) {}
 #endif
 
@@ -181,12 +183,14 @@ DECLARE_PER_CPU(struct device *, mce_device);
 
 #ifdef CONFIG_X86_MCE_INTEL
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void mce_intel_feature_clear(struct cpuinfo_x86 *c);
 void cmci_clear(void);
 void cmci_reenable(void);
 void cmci_rediscover(void);
 void cmci_recheck(void);
 #else
 static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { }
 static inline void cmci_clear(void) {}
 static inline void cmci_reenable(void) {}
 static inline void cmci_rediscover(void) {}
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 32b586ee006a..ee5272d77a16 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1606,6 +1606,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 	}
 }
 
+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+	switch (c->x86_vendor) {
+	case X86_VENDOR_INTEL:
+		mce_intel_feature_clear(c);
+		break;
+	default:
+		break;
+	}
+}
+
 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
 	unsigned long iv = check_interval * HZ;
@@ -1672,6 +1683,25 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_timer();
 }
 
+/*
+ * Called for each booted CPU to clear some machine checks opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+	if (mca_cfg.disabled)
+		return;
+
+	if (!mce_available(c))
+		return;
+
+	/*
+	 * Possibly to clear general settings generic to x86
+	 * __mcheck_cpu_clear_generic(c);
+	 */
+	__mcheck_cpu_clear_vendor(c);
+
+}
+
 /*
  * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
  */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 70f567f774ed..c5c003291861 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -434,7 +434,7 @@ static void intel_init_cmci(void)
 	cmci_recheck();
 }
 
-void intel_init_lmce(void)
+static void intel_init_lmce(void)
 {
 	u64 val;
 
@@ -447,9 +447,26 @@ void intel_init_lmce(void)
 		wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
 }
 
+static void intel_clear_lmce(void)
+{
+	u64 val;
+
+	if (!lmce_supported())
+		return;
+
+	rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+	val &= ~MCG_EXT_CTL_LMCE_EN;
+	wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
 	intel_init_cmci();
 	intel_init_lmce();
 }
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+	intel_clear_lmce();
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 397688beed4b..b20ef187ff41 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -29,6 +29,7 @@
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
+#include <asm/mce.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -319,6 +320,7 @@ void stop_this_cpu(void *dummy)
 	 */
 	set_cpu_online(smp_processor_id(), false);
 	disable_local_APIC();
+	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 
 	for (;;)
 		halt();
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 15aaa69bbb5e..12c8286206ce 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -30,6 +30,7 @@
 #include <asm/proto.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
+#include <asm/mce.h>
 #include <asm/trace/irq_vectors.h>
 /*
  *	Some notes on x86 processor bugs affecting SMP operation:
@@ -243,6 +244,7 @@ static void native_stop_other_cpus(int wait)
 finish:
 	local_irq_save(flags);
 	disable_local_APIC();
+	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 1b48465500611a2dc5e75800c61ac352e22d41c3 Mon Sep 17 00:00:00 2001
From: Xie XiuQi <xiexiuqi@huawei.com>
Date: Wed, 12 Aug 2015 18:29:41 +0200
Subject: x86/mce: Reenable CMCI banks when swiching back to interrupt mode

Zhang Liguang reported the following issue:

1) System detects a CMCI storm on the current CPU.

2) Kernel disables the CMCI interrupt on banks owned by the
   current CPU and switches to poll mode

3) After the CMCI storm subsides, kernel switches back to
   interrupt mode

4) We expect the system to reenable the CMCI interrupt on banks
   owned by the current CPU

   mce_intel_adjust_timer
   |-> cmci_reenable
       |-> cmci_discover     # owned banks are ignored here

  static void cmci_discover(int banks)
	...
	for (i = 0; i < banks; i++) {
		...
		if (test_bit(i, owned))	# ownd banks is ignore here
			continue;

So convert cmci_storm_disable_banks() to
cmci_toggle_interrupt_mode() which controls whether to enable or
disable CMCI interrupts with its argument.

NB: We cannot clear the owned bit because the banks won't be
polled, otherwise. See:

  27f6c573e0f7 ("x86, CMCI: Add proper detection of end of CMCI storms")

for more info.

Reported-by: Zhang Liguang <zhangliguang@huawei.com>
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org> # v3.15+
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: huawei.libin@huawei.com
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: rui.xiang@huawei.com
Link: http://lkml.kernel.org/r/1439396985-12812-10-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 41 +++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index c5c003291861..1e8bb6c94f14 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu)
 	per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
 }
 
+static void cmci_toggle_interrupt_mode(bool on)
+{
+	unsigned long flags, *owned;
+	int bank;
+	u64 val;
+
+	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+	owned = this_cpu_ptr(mce_banks_owned);
+	for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+		rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+
+		if (on)
+			val |= MCI_CTL2_CMCI_EN;
+		else
+			val &= ~MCI_CTL2_CMCI_EN;
+
+		wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+	}
+	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
 unsigned long cmci_intel_adjust_timer(unsigned long interval)
 {
 	if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
@@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
 		 */
 		if (!atomic_read(&cmci_storm_on_cpus)) {
 			__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
-			cmci_reenable();
+			cmci_toggle_interrupt_mode(true);
 			cmci_recheck();
 		}
 		return CMCI_POLL_INTERVAL;
@@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
 	}
 }
 
-static void cmci_storm_disable_banks(void)
-{
-	unsigned long flags, *owned;
-	int bank;
-	u64 val;
-
-	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
-	owned = this_cpu_ptr(mce_banks_owned);
-	for_each_set_bit(bank, owned, MAX_NR_BANKS) {
-		rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
-		val &= ~MCI_CTL2_CMCI_EN;
-		wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
-	}
-	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
 static bool cmci_storm_detect(void)
 {
 	unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -223,7 +228,7 @@ static bool cmci_storm_detect(void)
 	if (cnt <= CMCI_STORM_THRESHOLD)
 		return false;
 
-	cmci_storm_disable_banks();
+	cmci_toggle_interrupt_mode(false);
 	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
 	r = atomic_add_return(1, &cmci_storm_on_cpus);
 	mce_timer_kick(CMCI_STORM_INTERVAL);
-- 
cgit v1.2.3-70-g09d2


From 9a7783d02197f299f71b4fa2364a345c05f92b83 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 12 Aug 2015 18:29:43 +0200
Subject: x86/mce: Rename rcu_dereference_check_mce() to
 mce_log_get_idx_check()

The "rcu_" prefix misleads for it being a proper RCU interface
which is not. It basically checks whether we're preemptible or
holding the chrdev_read mutex.

Rename it accordingly.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1439396985-12812-12-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ee5272d77a16..b979711452a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -52,11 +52,11 @@
 
 static DEFINE_MUTEX(mce_chrdev_read_mutex);
 
-#define rcu_dereference_check_mce(p) \
+#define mce_log_get_idx_check(p) \
 ({ \
 	rcu_lockdep_assert(rcu_read_lock_sched_held() || \
 			   lockdep_is_held(&mce_chrdev_read_mutex), \
-			   "suspicious rcu_dereference_check_mce() usage"); \
+			   "suspicious mce_log_get_idx_check() usage"); \
 	smp_load_acquire(&(p)); \
 })
 
@@ -165,7 +165,7 @@ void mce_log(struct mce *mce)
 	mce->finished = 0;
 	wmb();
 	for (;;) {
-		entry = rcu_dereference_check_mce(mcelog.next);
+		entry = mce_log_get_idx_check(mcelog.next);
 		for (;;) {
 
 			/*
@@ -1812,7 +1812,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
 			goto out;
 	}
 
-	next = rcu_dereference_check_mce(mcelog.next);
+	next = mce_log_get_idx_check(mcelog.next);
 
 	/* Only supports full reads right now */
 	err = -EINVAL;
-- 
cgit v1.2.3-70-g09d2


From a79da38494ec23f1a7d6ee734e07e9575fd18b58 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 12 Aug 2015 18:29:44 +0200
Subject: x86/mce: Add a wrapper around mce_log() for injection

Will be used by an injector module in a following patch.

Additionally, add a missing module export reported by 0-DAY
kernel test.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1439396985-12812-13-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce-internal.h | 2 ++
 arch/x86/kernel/cpu/mcheck/mce.c          | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index ea8b62264c14..547720efd923 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -79,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id)
 	return -EINVAL;
 }
 #endif
+
+void mce_inject_log(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b979711452a5..e4e6646cac46 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -199,6 +199,14 @@ void mce_log(struct mce *mce)
 	set_bit(0, &mce_need_notify);
 }
 
+void mce_inject_log(struct mce *m)
+{
+	mutex_lock(&mce_chrdev_read_mutex);
+	mce_log(m);
+	mutex_unlock(&mce_chrdev_read_mutex);
+}
+EXPORT_SYMBOL_GPL(mce_inject_log);
+
 static struct notifier_block mce_srao_nb;
 
 void mce_register_decode_chain(struct notifier_block *nb)
-- 
cgit v1.2.3-70-g09d2


From ed596cde9425509ec6ce88e19f03e9b13b6f518b Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 13 Aug 2015 08:25:20 -0700
Subject: Revert x86 sigcontext cleanups

This reverts commits 9a036b93a344 ("x86/signal/64: Remove 'fs' and 'gs'
from sigcontext") and c6f2062935c8 ("x86/signal/64: Fix SS handling for
signals delivered to 64-bit programs").

They were cleanups, but they break dosemu by changing the signal return
behavior (and removing 'fs' and 'gs' from the sigcontext struct - while
not actually changing any behavior - causes build problems).

Reported-and-tested-by: Stas Sergeev <stsp@list.ru>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/sigcontext.h      |  6 +++---
 arch/x86/include/uapi/asm/sigcontext.h | 21 +++------------------
 arch/x86/kernel/signal.c               | 26 +++++++++++---------------
 3 files changed, 17 insertions(+), 36 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 6fe6b182c998..9dfce4e0417d 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -57,9 +57,9 @@ struct sigcontext {
 	unsigned long ip;
 	unsigned long flags;
 	unsigned short cs;
-	unsigned short __pad2;	/* Was called gs, but was always zero. */
-	unsigned short __pad1;	/* Was called fs, but was always zero. */
-	unsigned short ss;
+	unsigned short gs;
+	unsigned short fs;
+	unsigned short __pad0;
 	unsigned long err;
 	unsigned long trapno;
 	unsigned long oldmask;
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 0e8a973de9ee..40836a9a7250 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -177,24 +177,9 @@ struct sigcontext {
 	__u64 rip;
 	__u64 eflags;		/* RFLAGS */
 	__u16 cs;
-
-	/*
-	 * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
-	 * Linux saved and restored fs and gs in these slots.  This
-	 * was counterproductive, as fsbase and gsbase were never
-	 * saved, so arch_prctl was presumably unreliable.
-	 *
-	 * If these slots are ever needed for any other purpose, there
-	 * is some risk that very old 64-bit binaries could get
-	 * confused.  I doubt that many such binaries still work,
-	 * though, since the same patch in 2.5.64 also removed the
-	 * 64-bit set_thread_area syscall, so it appears that there is
-	 * no TLS API that works in both pre- and post-2.5.64 kernels.
-	 */
-	__u16 __pad2;		/* Was gs. */
-	__u16 __pad1;		/* Was fs. */
-
-	__u16 ss;
+	__u16 gs;
+	__u16 fs;
+	__u16 __pad0;
 	__u64 err;
 	__u64 trapno;
 	__u64 oldmask;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 206996c1669d..71820c42b6ce 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -93,8 +93,15 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
 		COPY(r15);
 #endif /* CONFIG_X86_64 */
 
+#ifdef CONFIG_X86_32
 		COPY_SEG_CPL3(cs);
 		COPY_SEG_CPL3(ss);
+#else /* !CONFIG_X86_32 */
+		/* Kernel saves and restores only the CS segment register on signals,
+		 * which is the bare minimum needed to allow mixed 32/64-bit code.
+		 * App's signal handler can save/restore other segments if needed. */
+		COPY_SEG_CPL3(cs);
+#endif /* CONFIG_X86_32 */
 
 		get_user_ex(tmpflags, &sc->flags);
 		regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -154,9 +161,8 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 #else /* !CONFIG_X86_32 */
 		put_user_ex(regs->flags, &sc->flags);
 		put_user_ex(regs->cs, &sc->cs);
-		put_user_ex(0, &sc->__pad2);
-		put_user_ex(0, &sc->__pad1);
-		put_user_ex(regs->ss, &sc->ss);
+		put_user_ex(0, &sc->gs);
+		put_user_ex(0, &sc->fs);
 #endif /* CONFIG_X86_32 */
 
 		put_user_ex(fpstate, &sc->fpstate);
@@ -451,19 +457,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
 
 	regs->sp = (unsigned long)frame;
 
-	/*
-	 * Set up the CS and SS registers to run signal handlers in
-	 * 64-bit mode, even if the handler happens to be interrupting
-	 * 32-bit or 16-bit code.
-	 *
-	 * SS is subtle.  In 64-bit mode, we don't need any particular
-	 * SS descriptor, but we do need SS to be valid.  It's possible
-	 * that the old SS is entirely bogus -- this can happen if the
-	 * signal we're trying to deliver is #GP or #SS caused by a bad
-	 * SS value.
-	 */
+	/* Set up the CS register to run signal handlers in 64-bit mode,
+	   even if the handler happens to be interrupting 32-bit code. */
 	regs->cs = __USER_CS;
-	regs->ss = __USER_DS;
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 6e38f1e79d16f4fa9e5cf06792500e11c96a6f84 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sun, 16 Aug 2015 11:45:45 -0400
Subject: x86/smpboot: Remove udelay(100) when polling cpu_initialized_map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the BSP sends the APIC INIT/SIPI/SIPI to the AP,
it waits for the AP to come up and indicate that it is alive
by setting its own bit in the cpu_initialized_mask.

Linux polls for up to 10 seconds for this to happen.
Each polling loop has a udelay(100) and a call to schedule().

The udelay(100) adds no value.

For example, on my desktop, the BSP waits for the
other 3 CPUs to come on line at boot for 305, 404, 405 usec.
For resume from S3, it waits 317, 404, 405 usec.

But when the udelay(100) is removed, the BSP waits
305, 310, 306 for boot, and 305, 307, 306 for resume.

So for both boot and resume, removing the udelay(100)
speeds online by about 100us in 2 of 3 cases.

Signed-off-by: Len Brown <len.brown@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Jan H. Schönherr <jschoenh@amazon.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
Link: http://lkml.kernel.org/r/33ef746c67d2489cad0a9b1958cf71167232ff2b.1439739165.git.len.brown@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b1f3ed9c7a9e..9ad88fb0a303 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -898,7 +898,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 
 	if (!boot_error) {
 		/*
-		 * Wait 10s total for a response from AP
+		 * Wait 10s total for first sign of life from AP
 		 */
 		boot_error = -1;
 		timeout = jiffies + 10*HZ;
@@ -911,7 +911,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 				boot_error = 0;
 				break;
 			}
-			udelay(100);
 			schedule();
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From 2d99af8e8fd6c2dea11ab539f7aba69c37b845b4 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sun, 16 Aug 2015 11:45:46 -0400
Subject: x86/smpboot: Remove udelay(100) when polling cpu_callin_map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the BSP sends INIT/SIPI/SIP to the AP and sees the AP
in the cpu_initialized_map, it sets the AP loose via the
cpu_callout_map, and waits for it via the cpu_callin_map.

The BSP polls the cpu_callin_map with a udelay(100)
and a schedule() in each iteration.

The udelay(100) adds no value.

For example, on my 4-CPU dekstop, the AP finishes
cpu_callin() in under 70 usec and sets the cpu_callin_mask.
The BSP, however, doesn't see that setting until over 30 usec
later, because it was still running its udelay(100)
when the AP finished.

Deleting the udelay(100) in the cpu_callin_mask polling loop,
saves from 0 to 100 usec per Application Processor.

Signed-off-by: Len Brown <len.brown@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Jan H. Schönherr <jschoenh@amazon.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
Link: http://lkml.kernel.org/r/0aade12eabeb89a688c929fe80856eaea0544bb7.1439739165.git.len.brown@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9ad88fb0a303..310b6f0bf6f7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -926,7 +926,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 			 * for the MTRR work(triggered by the AP coming online)
 			 * to be completed in the stop machine context.
 			 */
-			udelay(100);
 			schedule();
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From a9bcaa02a5104ace6a9d9e4a9cd9192a9e7744d6 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sun, 16 Aug 2015 11:45:47 -0400
Subject: x86/smpboot: Remove SIPI delays from cpu_up()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MPS 1.4 example code shows the following required delays during processor
on-lining:

	INIT
	 udelay(10,000)
	SIPI
	 udelay(200)
	SIPI
	 udelay(200) /* Linux actually implements this as udelay(300) */

Linux skips the udelay(10,000) on modern processors.
This patch removes the udelay(200) after each SIPI
on those same processors.

All three legacy delays can be restored by the cmdline
"cpu_init_udelay=10000".

As measured by analyze_suspend.py, this patch speeds
processor resume time on my desktop from 2.4ms to 1.8ms, per AP.

Signed-off-by: Len Brown <len.brown@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Jan H. Schönherr <jschoenh@amazon.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
Link: http://lkml.kernel.org/r/a5dfdbc8fbfdd813784da204aad5677fe459ac37.1439739165.git.len.brown@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 310b6f0bf6f7..674026455fee 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -665,7 +665,8 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		/*
 		 * Give the other CPU some time to accept the IPI.
 		 */
-		udelay(300);
+		if (init_udelay)
+			udelay(300);
 
 		pr_debug("Startup point 1\n");
 
@@ -675,7 +676,8 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		/*
 		 * Give the other CPU some time to accept the IPI.
 		 */
-		udelay(200);
+		if (init_udelay)
+			udelay(200);
 
 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
-- 
cgit v1.2.3-70-g09d2


From 656bba306827a44ed73b3f93f75bb3147de17fae Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sun, 16 Aug 2015 11:45:48 -0400
Subject: x86/smpboot: Remove APIC.wait_for_init_deassert and atomic
 init_deasserted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both the per-APIC flag ".wait_for_init_deassert",
and the global atomic_t "init_deasserted"
are dead code -- remove them.

For all APIC types, "wait_for_master()"
prevents an AP from proceeding until the BSP has set
cpu_callout_mask, making "init_deasserted" {unnecessary}:

	BSP: <de-assert INIT>
	...
	BSP: {set init_deasserted}
	AP: wait_for_master()
		set cpu_initialized_mask
		wait for cpu_callout_mask
	BSP: test cpu_initialized_mask
	BSP: set cpu_callout_mask
	AP: test cpu_callout_mask
	AP: {wait for init_deasserted}
	...
	AP: <touch APIC>

Deleting the {dead code} above is necessary to enable
some parallelism in a future patch.

Signed-off-by: Len Brown <len.brown@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Jan H. Schönherr <jschoenh@amazon.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
Link: http://lkml.kernel.org/r/de4b3a9bab894735e285870b5296da25ee6a8a5a.1439739165.git.len.brown@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/apic.h           |  2 --
 arch/x86/kernel/apic/apic_flat_64.c   |  2 --
 arch/x86/kernel/apic/apic_noop.c      |  1 -
 arch/x86/kernel/apic/apic_numachip.c  |  2 --
 arch/x86/kernel/apic/bigsmp_32.c      |  1 -
 arch/x86/kernel/apic/probe_32.c       |  1 -
 arch/x86/kernel/apic/x2apic_cluster.c |  1 -
 arch/x86/kernel/apic/x2apic_phys.c    |  1 -
 arch/x86/kernel/apic/x2apic_uv_x.c    |  2 --
 arch/x86/kernel/smpboot.c             | 16 +++-------------
 10 files changed, 3 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index c8393634ca0c..ebf6d5e5668c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,7 +313,6 @@ struct apic {
 	/* wakeup_secondary_cpu */
 	int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
 
-	bool wait_for_init_deassert;
 	void (*inquire_remote_apic)(int apicid);
 
 	/* apic ops */
@@ -378,7 +377,6 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[];
  * APIC functionality to boot other CPUs - only used on SMP:
  */
 #ifdef CONFIG_SMP
-extern atomic_t init_deasserted;
 extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
 #endif
 
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index de918c410eae..f92ab36979a2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -191,7 +191,6 @@ static struct apic apic_flat =  {
 	.send_IPI_all			= flat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
@@ -299,7 +298,6 @@ static struct apic apic_physflat =  {
 	.send_IPI_all			= physflat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index b205cdbdbe6a..0d96749cfcac 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -152,7 +152,6 @@ struct apic apic_noop = {
 
 	.wakeup_secondary_cpu		= noop_wakeup_secondary_cpu,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,
 
 	.read				= noop_apic_read,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 017149cded07..b548fd3b764b 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -92,7 +92,6 @@ static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 
 	write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
 
-	atomic_set(&init_deasserted, 1);
 	return 0;
 }
 
@@ -235,7 +234,6 @@ static const struct apic apic_numachip __refconst = {
 	.send_IPI_self			= numachip_send_IPI_self,
 
 	.wakeup_secondary_cpu		= numachip_wakeup_secondary,
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL, /* REMRD not supported */
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index c4a8d63f8220..971cf8875939 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -186,7 +186,6 @@ static struct apic apic_bigsmp = {
 	.send_IPI_all			= bigsmp_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.wait_for_init_deassert		= true,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index bda488680dbc..7694ae6c1199 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -111,7 +111,6 @@ static struct apic apic_default = {
 	.send_IPI_all			= default_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.wait_for_init_deassert		= true,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ab3219b3fbda..1b6c1a4526a5 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -272,7 +272,6 @@ static struct apic apic_x2apic_cluster = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,
 
 	.read				= native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 3ffd925655e0..662e9150ea6f 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -128,7 +128,6 @@ static struct apic apic_x2apic_phys = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,
 
 	.read				= native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c8d92950bc04..4a139465f1d4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -248,7 +248,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 	    APIC_DM_STARTUP;
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 
-	atomic_set(&init_deasserted, 1);
 	return 0;
 }
 
@@ -414,7 +413,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.send_IPI_self			= uv_send_IPI_self,
 
 	.wakeup_secondary_cpu		= uv_wakeup_secondary,
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,
 
 	.read				= native_apic_msr_read,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 674026455fee..c15d0073c829 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,8 +97,6 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
-atomic_t init_deasserted;
-
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
@@ -146,16 +144,11 @@ static void smp_callin(void)
 
 	/*
 	 * If waken up by an INIT in an 82489DX configuration
-	 * we may get here before an INIT-deassert IPI reaches
-	 * our local APIC.  We have to wait for the IPI or we'll
-	 * lock up on an APIC access.
-	 *
-	 * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI.
+	 * cpu_callout_mask guarantees we don't get here before
+	 * an INIT_deassert IPI reaches our local APIC, so it is
+	 * now safe to touch our local APIC.
 	 */
 	cpuid = smp_processor_id();
-	if (apic->wait_for_init_deassert && cpuid)
-		while (!atomic_read(&init_deasserted))
-			cpu_relax();
 
 	/*
 	 * (This works even if the APIC is not enabled.)
@@ -620,7 +613,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	send_status = safe_apic_wait_icr_idle();
 
 	mb();
-	atomic_set(&init_deasserted, 1);
 
 	/*
 	 * Should we send STARTUP IPIs ?
@@ -861,8 +853,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	 * the targeted processor.
 	 */
 
-	atomic_set(&init_deasserted, 0);
-
 	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
 
 		pr_debug("Setting warm reset code and vector.\n");
-- 
cgit v1.2.3-70-g09d2


From 527f0a91e91cd55ec79fce80451b0ad5d5e6a21a Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Tue, 18 Aug 2015 23:20:20 +0800
Subject: x86/irq: Build correct vector mapping for multiple MSI interrupts

Alex Deucher, Mark Rustad and Alexander Holler reported a regression
with the latest v4.2-rc4 kernel, which breaks some SATA controllers.
With multi-MSI capable SATA controllers, only the first port works,
all other ports time out when executing SATA commands.

This happens because the first argument to assign_irq_vector_policy()
is always the base linux irq number of the multi MSI interrupt block,
so all subsequent vector assignments operate on the base linux irq
number, so all MSI irqs are handled as the first irq number. Therefor
the other MSI irqs of a device are never set up correctly and never
fire.

Add the loop iterator to the base irq number so all vectors are
assigned correctly.

Fixes: b5dc8e6c21e7 "x86/irq: Use hierarchical irqdomain to manage CPU interrupt vectors"
Reported-and-tested-by: Alex Deucher <alexdeucher@gmail.com>
Reported-and-tested-by: Mark Rustad <mrustad@gmail.com>
Reported-and-tested-by: Alexander Holler <holler@ahsoftware.de>
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1439911228-9880-1-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/vector.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f813261d9740..2683f36e4e0a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -322,7 +322,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 		irq_data->chip = &lapic_controller;
 		irq_data->chip_data = data;
 		irq_data->hwirq = virq + i;
-		err = assign_irq_vector_policy(virq, irq_data->node, data,
+		err = assign_irq_vector_policy(virq + i, irq_data->node, data,
 					       info);
 		if (err)
 			goto error;
-- 
cgit v1.2.3-70-g09d2


From 7a67832c7e44c20935c5d6f2264035a0f7bf0d8f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 19 Aug 2015 00:34:34 -0400
Subject: libnvdimm, e820: make CONFIG_X86_PMEM_LEGACY a tristate option

We currently register a platform device for e820 type-12 memory and
register a nvdimm bus beneath it.  Registering the platform device
triggers the device-core machinery to probe for a driver, but that
search currently comes up empty.  Building the nvdimm-bus registration
into the e820_pmem platform device registration in this way forces
libnvdimm to be built-in.  Instead, convert the built-in portion of
CONFIG_X86_PMEM_LEGACY to simply register a platform device and move the
rest of the logic to the driver for e820_pmem, for the following
reasons:

1/ Letting e820_pmem support be a module allows building and testing
   libnvdimm.ko changes without rebooting

2/ All the normal policy around modules can be applied to e820_pmem
   (unbind to disable and/or blacklisting the module from loading by
   default)

3/ Moving the driver to a generic location and converting it to scan
   "iomem_resource" rather than "e820.map" means any other architecture can
   take advantage of this simple nvdimm resource discovery mechanism by
   registering a resource named "Persistent Memory (legacy)"

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/Kconfig                 |  6 ++-
 arch/x86/include/uapi/asm/e820.h |  2 +-
 arch/x86/kernel/Makefile         |  2 +-
 arch/x86/kernel/pmem.c           | 79 ++++--------------------------------
 drivers/nvdimm/Makefile          |  3 ++
 drivers/nvdimm/e820.c            | 86 ++++++++++++++++++++++++++++++++++++++++
 tools/testing/nvdimm/Kbuild      |  4 ++
 7 files changed, 108 insertions(+), 74 deletions(-)
 create mode 100644 drivers/nvdimm/e820.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b3a1a5d77d92..76c61154ed50 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1426,10 +1426,14 @@ config ILLEGAL_POINTER_VALUE
 
 source "mm/Kconfig"
 
+config X86_PMEM_LEGACY_DEVICE
+	bool
+
 config X86_PMEM_LEGACY
-	bool "Support non-standard NVDIMMs and ADR protected memory"
+	tristate "Support non-standard NVDIMMs and ADR protected memory"
 	depends on PHYS_ADDR_T_64BIT
 	depends on BLK_DEV
+	select X86_PMEM_LEGACY_DEVICE
 	select LIBNVDIMM
 	help
 	  Treat memory marked using the non-standard e820 type of 12 as used
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 0f457e6eab18..9dafe59cf6e2 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -37,7 +37,7 @@
 /*
  * This is a non-standardized way to represent ADR or NVDIMM regions that
  * persist over a reboot.  The kernel will ignore their special capabilities
- * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ * unless the CONFIG_X86_PMEM_LEGACY option is set.
  *
  * ( Note that older platforms also used 6 for the same type of memory,
  *   but newer versions switched to 12 as 6 was assigned differently.  Some
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f15af41bd80..ac2bb7e28ba2 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -92,7 +92,7 @@ obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
-obj-$(CONFIG_X86_PMEM_LEGACY)	+= pmem.o
+obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 64f90f53bb85..4f00b63d7ff3 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -3,80 +3,17 @@
  * Copyright (c) 2015, Intel Corporation.
  */
 #include <linux/platform_device.h>
-#include <linux/libnvdimm.h>
 #include <linux/module.h>
-#include <asm/e820.h>
-
-static void e820_pmem_release(struct device *dev)
-{
-	struct nvdimm_bus *nvdimm_bus = dev->platform_data;
-
-	if (nvdimm_bus)
-		nvdimm_bus_unregister(nvdimm_bus);
-}
-
-static struct platform_device e820_pmem = {
-	.name = "e820_pmem",
-	.id = -1,
-	.dev = {
-		.release = e820_pmem_release,
-	},
-};
-
-static const struct attribute_group *e820_pmem_attribute_groups[] = {
-	&nvdimm_bus_attribute_group,
-	NULL,
-};
-
-static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
-	&nd_region_attribute_group,
-	&nd_device_attribute_group,
-	NULL,
-};
 
 static __init int register_e820_pmem(void)
 {
-	static struct nvdimm_bus_descriptor nd_desc;
-	struct device *dev = &e820_pmem.dev;
-	struct nvdimm_bus *nvdimm_bus;
-	int rc, i;
-
-	rc = platform_device_register(&e820_pmem);
-	if (rc)
-		return rc;
-
-	nd_desc.attr_groups = e820_pmem_attribute_groups;
-	nd_desc.provider_name = "e820";
-	nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
-	if (!nvdimm_bus)
-		goto err;
-	dev->platform_data = nvdimm_bus;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		struct resource res = {
-			.flags	= IORESOURCE_MEM,
-			.start	= ei->addr,
-			.end	= ei->addr + ei->size - 1,
-		};
-		struct nd_region_desc ndr_desc;
-
-		if (ei->type != E820_PRAM)
-			continue;
-
-		memset(&ndr_desc, 0, sizeof(ndr_desc));
-		ndr_desc.res = &res;
-		ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
-		ndr_desc.numa_node = NUMA_NO_NODE;
-		if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
-			goto err;
-	}
-
-	return 0;
-
- err:
-	dev_err(dev, "failed to register legacy persistent memory ranges\n");
-	platform_device_unregister(&e820_pmem);
-	return -ENXIO;
+	struct platform_device *pdev;
+
+	/*
+	 * See drivers/nvdimm/e820.c for the implementation, this is
+	 * simply here to trigger the module to load on demand.
+	 */
+	pdev = platform_device_alloc("e820_pmem", -1);
+	return platform_device_add(pdev);
 }
 device_initcall(register_e820_pmem);
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 594bb97c867a..9bf15db52dee 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
 obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 
 nd_pmem-y := pmem.o
 
@@ -9,6 +10,8 @@ nd_btt-y := btt.o
 
 nd_blk-y := blk.o
 
+nd_e820-y := e820.o
+
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
new file mode 100644
index 000000000000..1b5743ad92db
--- /dev/null
+++ b/drivers/nvdimm/e820.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ * Copyright (c) 2015, Intel Corporation.
+ */
+#include <linux/platform_device.h>
+#include <linux/libnvdimm.h>
+#include <linux/module.h>
+
+static const struct attribute_group *e820_pmem_attribute_groups[] = {
+	&nvdimm_bus_attribute_group,
+	NULL,
+};
+
+static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
+	&nd_region_attribute_group,
+	&nd_device_attribute_group,
+	NULL,
+};
+
+static int e820_pmem_remove(struct platform_device *pdev)
+{
+	struct nvdimm_bus *nvdimm_bus = platform_get_drvdata(pdev);
+
+	nvdimm_bus_unregister(nvdimm_bus);
+	return 0;
+}
+
+static int e820_pmem_probe(struct platform_device *pdev)
+{
+	static struct nvdimm_bus_descriptor nd_desc;
+	struct device *dev = &pdev->dev;
+	struct nvdimm_bus *nvdimm_bus;
+	struct resource *p;
+
+	nd_desc.attr_groups = e820_pmem_attribute_groups;
+	nd_desc.provider_name = "e820";
+	nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
+	if (!nvdimm_bus)
+		goto err;
+	platform_set_drvdata(pdev, nvdimm_bus);
+
+	for (p = iomem_resource.child; p ; p = p->sibling) {
+		struct nd_region_desc ndr_desc;
+
+		if (strncmp(p->name, "Persistent Memory (legacy)", 26) != 0)
+			continue;
+
+		memset(&ndr_desc, 0, sizeof(ndr_desc));
+		ndr_desc.res = p;
+		ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
+		ndr_desc.numa_node = NUMA_NO_NODE;
+		if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
+			goto err;
+	}
+
+	return 0;
+
+ err:
+	nvdimm_bus_unregister(nvdimm_bus);
+	dev_err(dev, "failed to register legacy persistent memory ranges\n");
+	return -ENXIO;
+}
+
+static struct platform_driver e820_pmem_driver = {
+	.probe = e820_pmem_probe,
+	.remove = e820_pmem_remove,
+	.driver = {
+		.name = "e820_pmem",
+	},
+};
+
+static __init int e820_pmem_init(void)
+{
+	return platform_driver_register(&e820_pmem_driver);
+}
+
+static __exit void e820_pmem_exit(void)
+{
+	platform_driver_unregister(&e820_pmem_driver);
+}
+
+MODULE_ALIAS("platform:e820_pmem*");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+module_init(e820_pmem_init);
+module_exit(e820_pmem_exit);
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index f56914c7929b..d7c136a96346 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -15,6 +15,7 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
 obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_ACPI_NFIT) += nfit.o
 
 nfit-y := $(ACPI_SRC)/nfit.o
@@ -29,6 +30,9 @@ nd_btt-y += config_check.o
 nd_blk-y := $(NVDIMM_SRC)/blk.o
 nd_blk-y += config_check.o
 
+nd_e820-y := $(NVDIMM_SRC)/e820.o
+nd_e820-y += config_check.o
+
 libnvdimm-y := $(NVDIMM_SRC)/core.o
 libnvdimm-y += $(NVDIMM_SRC)/bus.o
 libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o
-- 
cgit v1.2.3-70-g09d2


From e43d0189ac02415fe4487f79fc35e8f147e9ea0d Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@marvell.com>
Date: Thu, 20 Aug 2015 12:54:39 +0800
Subject: x86/idle: Restore trace_cpu_idle to mwait_idle() calls

Commit b253149b843f ("sched/idle/x86: Restore mwait_idle() to fix boot
hangs, to improve power savings and to improve performance") restores
mwait_idle(), but the trace_cpu_idle related calls are missing. This
causes powertop on my old desktop powered by Intel Core2 E6550 to
report zero wakeups and zero events.

Add them back to restore the proper behaviour.

Fixes: b253149b843f ("sched/idle/x86: Restore mwait_idle() to ...")
Signed-off-by: Jisheng Zhang <jszhang@marvell.com>
Cc: <len.brown@intel.com>
Cc: stable@vger.kernel.org # 4.1
Link: http://lkml.kernel.org/r/1440046479-4262-1-git-send-email-jszhang@marvell.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/process.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 397688beed4b..c27cad726765 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -408,6 +408,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
 static void mwait_idle(void)
 {
 	if (!current_set_polling_and_test()) {
+		trace_cpu_idle_rcuidle(1, smp_processor_id());
 		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
 			smp_mb(); /* quirk */
 			clflush((void *)&current_thread_info()->flags);
@@ -419,6 +420,7 @@ static void mwait_idle(void)
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
+		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 	} else {
 		local_irq_enable();
 	}
-- 
cgit v1.2.3-70-g09d2


From 82819ffb42fb45197bacf3223191deca31d3eb91 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 21 Aug 2015 08:14:46 +0200
Subject: perf/x86/msr: Fix the MSR driver build

The new MSR PMU driver made use of rdtsc() which does not exist (yet) in
this tree:

  arch/x86/kernel/cpu/perf_event_msr.c:91:3: error: implicit declaration of function 'rdtsc'

Use the old rdtscll() primitive for now.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_msr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
index b0dd2e8a6d12..086b12eae794 100644
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -145,7 +145,7 @@ static inline u64 msr_read_counter(struct perf_event *event)
 	if (event->hw.event_base)
 		rdmsrl(event->hw.event_base, now);
 	else
-		now = rdtsc();
+		rdtscll(now);
 
 	return now;
 }
-- 
cgit v1.2.3-70-g09d2


From 88c9281a9fba67636ab26c1fd6afbc78a632374f Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 19 Aug 2015 09:54:24 -0700
Subject: x86/hyperv: Mark the Hyper-V TSC as unstable

The Hyper-V top-level functional specification states, that
"algorithms should be resilient to sudden jumps forward or
backward in the TSC value", this means that we should consider
TSC as unstable. In some cases tsc tests are able to detect the
instability, it was detected in 543 out of 646 boots in my
testing:

 Measured 6277 cycles TSC warp between CPUs, turning off TSC clock.
 tsc: Marking TSC unstable due to check_tsc_sync_source failed

This is, however, just a heuristic. On Hyper-V platform there
are two good clocksources: MSR-based hyperv_clocksource and
recently introduced TSC page.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: devel@linuxdriverproject.org
Link: http://lkml.kernel.org/r/1440003264-9949-1-git-send-email-vkuznets@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mshyperv.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index aad4bd84b475..6fd023d7492d 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -141,6 +141,7 @@ static void __init ms_hyperv_init_platform(void)
 	no_timer_check = 1;
 #endif
 
+	mark_tsc_unstable("running on Hyper-V");
 }
 
 const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
-- 
cgit v1.2.3-70-g09d2


From 5fc960380ea44ba529c78b558b6cd4250e5e1958 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 22 Aug 2015 09:52:06 +0200
Subject: x86/fpu/math-emu: Fix math-emu boot crash

On a math-emu bootup the following crash occurs:

	Initializing CPU#0
	------------[ cut here ]------------
	kernel BUG at arch/x86/kernel/traps.c:779!
	invalid opcode: 0000 [#1] SMP
	[...]
	EIP is at do_device_not_available+0xe/0x70
	[...]
	Call Trace:
	 [<c18238e6>] error_code+0x5a/0x60
	 [<c1002bd0>] ? math_error+0x140/0x140
	 [<c100bbd9>] ? fpu__init_cpu+0x59/0xa0
	 [<c1012322>] cpu_init+0x202/0x330
	 [<c104509f>] ? __native_set_fixmap+0x1f/0x30
	 [<c1b56ab0>] trap_init+0x305/0x346
	 [<c1b548af>] start_kernel+0x1a5/0x35d
	 [<c1b542b4>] i386_start_kernel+0x82/0x86

The reason is that in the following commit:

  b1276c48e91b ("x86/fpu: Initialize fpregs in fpu__init_cpu_generic()")

I failed to consider math-emu's limitation that it cannot execute the
FNINIT instruction in kernel mode.

The long term fix might be to allow math-emu to execute (certain) kernel
mode FPU instructions, but for now apply the safe (albeit somewhat ugly)
fix: initialize the emulation state explicitly without trapping out to
the FPU emulator.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/init.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 1e173f6285c7..d14e9ac3235a 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -40,7 +40,12 @@ static void fpu__init_cpu_generic(void)
 	write_cr0(cr0);
 
 	/* Flush out any pending x87 state: */
-	asm volatile ("fninit");
+#ifdef CONFIG_MATH_EMULATION
+	if (!cpu_has_fpu)
+		fpstate_init_soft(&current->thread.fpu.state.soft);
+	else
+#endif
+		asm volatile ("fninit");
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 827409b2f5b58573ae3774fe6bd2d6daeb335878 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 27 May 2015 12:22:29 +0200
Subject: x86/fpu/math-emu: Fix crash in fork()

During later stages of math-emu bootup the following crash triggers:

	 math_emulate: 0060:c100d0a8
	 Kernel panic - not syncing: Math emulation needed in kernel
	 CPU: 0 PID: 1511 Comm: login Not tainted 4.2.0-rc7+ #1012
	 [...]
	 Call Trace:
	  [<c181d50d>] dump_stack+0x41/0x52
	  [<c181c918>] panic+0x77/0x189
	  [<c1003530>] ? math_error+0x140/0x140
	  [<c164c2d7>] math_emulate+0xba7/0xbd0
	  [<c100d0a8>] ? fpu__copy+0x138/0x1c0
	  [<c1109c3c>] ? __alloc_pages_nodemask+0x12c/0x870
	  [<c136ac20>] ? proc_clear_tty+0x40/0x70
	  [<c136ac6e>] ? session_clear_tty+0x1e/0x30
	  [<c1003530>] ? math_error+0x140/0x140
	  [<c1003575>] do_device_not_available+0x45/0x70
	  [<c100d0a8>] ? fpu__copy+0x138/0x1c0
	  [<c18258e6>] error_code+0x5a/0x60
	  [<c1003530>] ? math_error+0x140/0x140
	  [<c100d0a8>] ? fpu__copy+0x138/0x1c0
	  [<c100c205>] arch_dup_task_struct+0x25/0x30
	  [<c1048cea>] copy_process.part.51+0xea/0x1480
	  [<c115a8e5>] ? dput+0x175/0x200
	  [<c136af70>] ? no_tty+0x30/0x30
	  [<c1157242>] ? do_vfs_ioctl+0x322/0x540
	  [<c104a21a>] _do_fork+0xca/0x340
	  [<c1057b06>] ? SyS_rt_sigaction+0x66/0x90
	  [<c104a557>] SyS_clone+0x27/0x30
	  [<c1824a80>] sysenter_do_call+0x12/0x12

The reason is the incorrect assumption in fpu_copy(), that FNSAVE
can be executed from math-emu kernels as well.

Don't try to copy the registers, the soft state will be copied
by fork anyway, so the child task inherits the parent task's
soft math state.

With this fix applied math-emu kernels boot up fine on modern
hardware and the 'no387 nofxsr' boot options.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Bobby Powers <bobbypowers@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 79de954626fd..d25097c3fc1d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -270,7 +270,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 	dst_fpu->fpregs_active = 0;
 	dst_fpu->last_cpu = -1;
 
-	if (src_fpu->fpstate_active)
+	if (src_fpu->fpstate_active && cpu_has_fpu)
 		fpu_copy(dst_fpu, src_fpu);
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From f0a97af83f6287357dcc100c859ec0066f164f32 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 20 Aug 2015 22:03:21 -0700
Subject: x86/traps: Weaken context tracking entry assertions

We were asserting that we were all the way in CONTEXT_KERNEL
when exception handlers were called.  While having this be true
is, I think, a nice goal (or maybe a variant in which we assert
that we're in CONTEXT_KERNEL or some new IRQ context), we're not
quite there.

In particular, if an IRQ interrupts the SYSCALL prologue and the
IRQ handler in turn causes an exception, the exception entry
will be called in RCU IRQ mode but with CONTEXT_USER.

This is okay (nothing goes wrong), but until we fix up the
SYSCALL prologue, we need to avoid warning.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c81faf3916346c0e04346c441392974f49cd7184.1440133286.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 86a82eafb96f..45e8d9891fa3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 void ist_enter(struct pt_regs *regs)
 {
 	if (user_mode(regs)) {
-		CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+		rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	} else {
 		/*
 		 * We might have interrupted pretty much anything.  In
@@ -282,7 +282,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 {
 	siginfo_t info;
 
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
@@ -364,7 +364,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	const struct bndcsr *bndcsr;
 	siginfo_t *info;
 
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
 		return;
@@ -442,7 +442,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk;
 
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	conditional_sti(regs);
 
 	if (v8086_mode(regs)) {
@@ -496,7 +496,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 		return;
 
 	ist_enter(regs);
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 				SIGTRAP) == NOTIFY_STOP)
@@ -729,14 +729,14 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_MF);
 }
 
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_XF);
 }
 
@@ -749,7 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	BUG_ON(use_eager_fpu());
 
 #ifdef CONFIG_MATH_EMULATION
@@ -775,7 +775,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 {
 	siginfo_t info;
 
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	local_irq_enable();
 
 	info.si_signo = SIGILL;
-- 
cgit v1.2.3-70-g09d2


From b466bdb614823aaaa7188e85516177d2850f4782 Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Mon, 10 Aug 2015 12:19:54 +0200
Subject: x86/asm/delay: Introduce an MWAITX-based delay with a configurable
 timer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MWAITX can enable a timer and a corresponding timer value
specified in SW P0 clocks. The SW P0 frequency is the same as
TSC. The timer provides an upper bound on how long the
instruction waits before exiting.

This way, a delay function in the kernel can leverage that
MWAITX timer of MWAITX.

When a CPU core executes MWAITX, it will be quiesced in a
waiting phase, diminishing its power consumption. This way, we
can save power in comparison to our default TSC-based delays.

A simple test shows that:

	$ cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc
	$ sleep 10000s
	$ cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc

Results:

	* TSC-based default delay:      485115 uWatts average power
	* MWAITX-based delay:           252738 uWatts average power

Thus, that's about 240 milliWatts less power consumption. The
test method relies on the support of AMD CPU accumulated power
algorithm in fam15h_power for which patches are forthcoming.

Suggested-by: Andy Lutomirski <luto@amacapital.net>
Suggested-by: Borislav Petkov <bp@suse.de>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Huang Rui <ray.huang@amd.com>
[ Fix delay truncation. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andreas Herrmann <herrmann.der.user@gmail.com>
Cc: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hector Marco-Gisbert <hecmargi@upv.es>
Cc: Jacob Shin <jacob.w.shin@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Li <tony.li@amd.com>
Link: http://lkml.kernel.org/r/1438744732-1459-3-git-send-email-ray.huang@amd.com
Link: http://lkml.kernel.org/r/1439201994-28067-4-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/delay.h |  1 +
 arch/x86/kernel/cpu/amd.c    |  4 ++++
 arch/x86/lib/delay.c         | 47 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 9b3b4f2754c7..36a760bda462 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -4,5 +4,6 @@
 #include <asm-generic/delay.h>
 
 void use_tsc_delay(void);
+void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 51ad2af84a72..4a70fc6d400a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -11,6 +11,7 @@
 #include <asm/cpu.h>
 #include <asm/smp.h>
 #include <asm/pci-direct.h>
+#include <asm/delay.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/mmconfig.h>
@@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		/* A random value per boot for bit slice [12:upper_bit) */
 		va_align.bits = get_random_int() & va_align.mask;
 	}
+
+	if (cpu_has(c, X86_FEATURE_MWAITX))
+		use_mwaitx_delay();
 }
 
 static void early_init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 4453d52a143d..e912b2f6d36e 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -20,6 +20,7 @@
 #include <asm/processor.h>
 #include <asm/delay.h>
 #include <asm/timer.h>
+#include <asm/mwait.h>
 
 #ifdef CONFIG_SMP
 # include <asm/smp.h>
@@ -83,6 +84,44 @@ static void delay_tsc(unsigned long __loops)
 	preempt_enable();
 }
 
+/*
+ * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
+ * counts with TSC frequency. The input value is the loop of the
+ * counter, it will exit when the timer expires.
+ */
+static void delay_mwaitx(unsigned long __loops)
+{
+	u64 start, end, delay, loops = __loops;
+
+	start = rdtsc_ordered();
+
+	for (;;) {
+		delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
+
+		/*
+		 * Use cpu_tss as a cacheline-aligned, seldomly
+		 * accessed per-cpu variable as the monitor target.
+		 */
+		__monitorx(this_cpu_ptr(&cpu_tss), 0, 0);
+
+		/*
+		 * AMD, like Intel, supports the EAX hint and EAX=0xf
+		 * means, do not enter any deep C-state and we use it
+		 * here in delay() to minimize wakeup latency.
+		 */
+		__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+
+		end = rdtsc_ordered();
+
+		if (loops <= end - start)
+			break;
+
+		loops -= end - start;
+
+		start = end;
+	}
+}
+
 /*
  * Since we calibrate only once at boot, this
  * function should be set once at boot and not changed
@@ -91,7 +130,13 @@ static void (*delay_fn)(unsigned long) = delay_loop;
 
 void use_tsc_delay(void)
 {
-	delay_fn = delay_tsc;
+	if (delay_fn == delay_loop)
+		delay_fn = delay_tsc;
+}
+
+void use_mwaitx_delay(void)
+{
+	delay_fn = delay_mwaitx;
 }
 
 int read_current_timer(unsigned long *timer_val)
-- 
cgit v1.2.3-70-g09d2


From a57e456a7b28431b55e407e5ab78ebd5b378d19e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 22 Aug 2015 16:41:17 +0200
Subject: x86/apic: Fix fallout from x2apic cleanup

In the recent x2apic cleanup I got two things really wrong:
1) The safety check in __disable_x2apic which allows the function to
   be called unconditionally is backwards. The check is there to
   prevent access to the apic MSR in case that the machine has no
   apic. Though right now it returns if the machine has an apic and
   therefor the disabling of x2apic is never invoked.

2) x2apic_disable() sets x2apic_mode to 0 after registering the local
   apic. That's wrong, because register_lapic_address() checks x2apic
   mode and therefor takes the wrong code path.

This results in boot failures on machines with x2apic preenabled by
BIOS and can also lead to an fatal MSR access on machines without
apic.

The solutions are simple:
1) Correct the sanity check for apic availability
2) Clear x2apic_mode _before_ calling register_lapic_address()

Fixes: 659006bf3ae3 'x86/x2apic: Split enable and setup function'
Reported-and-tested-by: Javier Monteagudo <javiermon@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://bugzilla.redhat.com/show_bug.cgi?id=1224764
Cc: stable@vger.kernel.org # 4.0+
Cc: Laura Abbott <labbott@redhat.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
---
 arch/x86/kernel/apic/apic.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dcb52850a28f..cde732c1b495 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1424,7 +1424,7 @@ static inline void __x2apic_disable(void)
 {
 	u64 msr;
 
-	if (cpu_has_apic)
+	if (!cpu_has_apic)
 		return;
 
 	rdmsrl(MSR_IA32_APICBASE, msr);
@@ -1483,10 +1483,13 @@ void x2apic_setup(void)
 
 static __init void x2apic_disable(void)
 {
-	u32 x2apic_id;
+	u32 x2apic_id, state = x2apic_state;
 
-	if (x2apic_state != X2APIC_ON)
-		goto out;
+	x2apic_mode = 0;
+	x2apic_state = X2APIC_DISABLED;
+
+	if (state != X2APIC_ON)
+		return;
 
 	x2apic_id = read_apic_id();
 	if (x2apic_id >= 255)
@@ -1494,9 +1497,6 @@ static __init void x2apic_disable(void)
 
 	__x2apic_disable();
 	register_lapic_address(mp_lapic_addr);
-out:
-	x2apic_state = X2APIC_DISABLED;
-	x2apic_mode = 0;
 }
 
 static __init void x2apic_enable(void)
-- 
cgit v1.2.3-70-g09d2


From 47edb65178cb7056c2eea0b6c41a7d8c84547192 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 23 Jul 2015 12:14:40 -0700
Subject: x86/asm/msr: Make wrmsrl() a function

As of cf991de2f614 ("x86/asm/msr: Make wrmsrl_safe() a
function"), wrmsrl_safe is a function, but wrmsrl is still a
macro.  The wrmsrl macro performs invalid shifts if the value
argument is 32 bits. This makes it unnecessarily awkward to
write code that puts an unsigned long into an MSR.

To make this work, syscall_init needs tweaking to stop passing
a function pointer to wrmsrl.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Willy Tarreau <w@1wt.eu>
Link: http://lkml.kernel.org/r/690f0c629a1085d054e2d1ef3da073cfb3f7db92.1437678821.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/msr.h      | 6 ++++--
 arch/x86/include/asm/paravirt.h | 6 +++++-
 arch/x86/kernel/cpu/common.c    | 6 +++---
 3 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 54e9f088919d..77d8b284e4a7 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -188,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
 #define rdmsrl(msr, val)			\
 	((val) = native_read_msr((msr)))
 
-#define wrmsrl(msr, val)						\
-	native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32))
+static inline void wrmsrl(unsigned msr, u64 val)
+{
+	native_write_msr(msr, (u32)val, (u32)(val >> 32));
+}
 
 /* wrmsr with exception handling */
 static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c2be0375bcad..10d0596433f8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -153,7 +153,11 @@ do {						\
 	val = paravirt_read_msr(msr, &_err);	\
 } while (0)
 
-#define wrmsrl(msr, val)	wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
+static inline void wrmsrl(unsigned msr, u64 val)
+{
+	wrmsr(msr, (u32)val, (u32)(val>>32));
+}
+
 #define wrmsr_safe(msr, a, b)	paravirt_write_msr(msr, a, b)
 
 /* rdmsr with exception handling */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb9e5df42dd2..b128808853a2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1185,10 +1185,10 @@ void syscall_init(void)
 	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
 	 */
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
-	wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
-	wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
+	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
 	/*
 	 * This only works on Intel CPUs.
 	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1199,7 +1199,7 @@ void syscall_init(void)
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
-	wrmsrl(MSR_CSTAR, ignore_sysret);
+	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
-- 
cgit v1.2.3-70-g09d2


From 13fe86f465b72fc9328d4f5ebc33223c011852ae Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 24 Aug 2015 19:34:55 -0400
Subject: x86/mm: Make kernel/check.c explicitly non-modular

The Kconfig currently controlling compilation of this code is:

  arch/x86/Kconfig:config X86_CHECK_BIOS_CORRUPTION
  arch/x86/Kconfig:       bool "Check for low memory corruption"

...meaning that it currently is not being built as a module by
anyone.

Lets remove the couple traces of modularity so that when reading
the code there is no doubt it is builtin-only.

Since module_init translates to device_initcall in the
non-modular case, the init ordering remains unchanged with this
commit.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1440459295-21814-4-git-send-email-paul.gortmaker@windriver.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/check.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 83a7995625a6..f40cfd12ba66 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/kthread.h>
 #include <linux/workqueue.h>
@@ -162,6 +162,5 @@ static int start_periodic_check_for_corruption(void)
 	schedule_delayed_work(&bios_check_work, 0);
 	return 0;
 }
-
-module_init(start_periodic_check_for_corruption);
+device_initcall(start_periodic_check_for_corruption);
 
-- 
cgit v1.2.3-70-g09d2


From 5d0ddfebb93069061880fc57ee4ba7246bd1e1ee Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Fri, 21 Aug 2015 15:36:23 +0800
Subject: ACPI, PCI: Penalize legacy IRQ used by ACPI SCI

Nick Meier reported a regression with HyperV that "
  After rebooting the VM, the following messages are logged in syslog
  when trying to load the tulip driver:
    tulip: Linux Tulip drivers version 1.1.15 (Feb 27, 2007)
    tulip: 0000:00:0a.0: PCI INT A: failed to register GSI
    tulip: Cannot enable tulip board #0, aborting
    tulip: probe of 0000:00:0a.0 failed with error -16
  Errors occur in 3.19.0 kernel
  Works in 3.17 kernel.
"

According to the ACPI dump file posted by Nick at
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1440072

The ACPI MADT table includes an interrupt source overridden entry for
ACPI SCI:
[236h 0566  1]                Subtable Type : 02 <Interrupt Source Override>
[237h 0567  1]                       Length : 0A
[238h 0568  1]                          Bus : 00
[239h 0569  1]                       Source : 09
[23Ah 0570  4]                    Interrupt : 00000009
[23Eh 0574  2]        Flags (decoded below) : 000D
                                   Polarity : 1
                               Trigger Mode : 3

And in DSDT table, we have _PRT method to define PCI interrupts, which
eventually goes to:
        Name (PRSA, ResourceTemplate ()
        {
            IRQ (Level, ActiveLow, Shared, )
                {3,4,5,7,9,10,11,12,14,15}
        })
        Name (PRSB, ResourceTemplate ()
        {
            IRQ (Level, ActiveLow, Shared, )
                {3,4,5,7,9,10,11,12,14,15}
        })
        Name (PRSC, ResourceTemplate ()
        {
            IRQ (Level, ActiveLow, Shared, )
                {3,4,5,7,9,10,11,12,14,15}
        })
        Name (PRSD, ResourceTemplate ()
        {
            IRQ (Level, ActiveLow, Shared, )
                {3,4,5,7,9,10,11,12,14,15}
        })

According to the MADT and DSDT tables, IRQ 9 may be used for:
 1) ACPI SCI in level, high mode
 2) PCI legacy IRQ in level, low mode
So there's a conflict in polarity setting for IRQ 9.

Prior to commit cd68f6bd53cf ("x86, irq, acpi: Get rid of special
handling of GSI for ACPI SCI"), ACPI SCI is handled specially and
there's no check for conflicts between ACPI SCI and PCI legagy IRQ.
And it seems that the HyperV hypervisor doesn't make use of the
polarity configuration in IOAPIC entry, so it just works.

Commit cd68f6bd53cf gets rid of the specially handling of ACPI SCI,
and then the pin attribute checking code discloses the conflicts
between ACPI SCI and PCI legacy IRQ on HyperV virtual machine,
and rejects the request to assign IRQ9 to PCI devices.

So penalize legacy IRQ used by ACPI SCI and mark it unusable if ACPI
SCI attributes conflict with PCI IRQ attributes.

Please refer to following links for more information:
https://bugzilla.kernel.org/show_bug.cgi?id=101301
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1440072

Fixes: cd68f6bd53cf ("x86, irq, acpi: Get rid of special handling of GSI for ACPI SCI")
Reported-and-tested-by: Nick Meier <nmeier@microsoft.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: 3.19+ <stable@vger.kernel.org> # 3.19+
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/kernel/acpi/boot.c |  1 +
 drivers/acpi/pci_link.c     | 16 ++++++++++++++++
 include/linux/acpi.h        |  2 +-
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e49ee24da85e..9393896717d0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -445,6 +445,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
 		polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
 
 	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+	acpi_penalize_sci_irq(bus_irq, trigger, polarity);
 
 	/*
 	 * stash over-ride to indicate we've been here
diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
index cfd7581cc19f..b09ad554430a 100644
--- a/drivers/acpi/pci_link.c
+++ b/drivers/acpi/pci_link.c
@@ -825,6 +825,22 @@ void acpi_penalize_isa_irq(int irq, int active)
 	}
 }
 
+/*
+ * Penalize IRQ used by ACPI SCI. If ACPI SCI pin attributes conflict with
+ * PCI IRQ attributes, mark ACPI SCI as ISA_ALWAYS so it won't be use for
+ * PCI IRQs.
+ */
+void acpi_penalize_sci_irq(int irq, int trigger, int polarity)
+{
+	if (irq >= 0 && irq < ARRAY_SIZE(acpi_irq_penalty)) {
+		if (trigger != ACPI_MADT_TRIGGER_LEVEL ||
+		    polarity != ACPI_MADT_POLARITY_ACTIVE_LOW)
+			acpi_irq_penalty[irq] += PIRQ_PENALTY_ISA_ALWAYS;
+		else
+			acpi_irq_penalty[irq] += PIRQ_PENALTY_PCI_USING;
+	}
+}
+
 /*
  * Over-ride default table to reserve additional IRQs for use by ISA
  * e.g. acpi_irq_isa=5
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index d2445fa9999f..0b2394f61af4 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -221,7 +221,7 @@ struct pci_dev;
 
 int acpi_pci_irq_enable (struct pci_dev *dev);
 void acpi_penalize_isa_irq(int irq, int active);
-
+void acpi_penalize_sci_irq(int irq, int trigger, int polarity);
 void acpi_pci_irq_disable (struct pci_dev *dev);
 
 extern int ec_read(u8 addr, u8 *val);
-- 
cgit v1.2.3-70-g09d2


From 2baa891e42d84159b693eadd44f6fe1486285bdc Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@suse.com>
Date: Mon, 24 Aug 2015 12:13:33 -0700
Subject: x86/mm/mtrr: Remove kernel internal MTRR interfaces: unexport
 mtrr_add() and mtrr_del()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The effort to replace mtrr_add() with architecture agnostic
arch_phys_wc_add() is complete, this will ensure write-combining
implementations (PAT on x86) is taken advantage instead of using
MTRR. With the effort done now, hide direct MTRR access for
drivers.

The legacy user-space /proc/mtrr ABI is not affected.

Update x86 documentation on MTRR to reflect the completion of
the phasing out of direct access to MTRR, also add a note on
platform firmware code use of MTRRs based on the obituary
discussion of MTRRs on Linux [0].

  [0] http://lkml.kernel.org/r/1438991330.3109.196.camel@hp.com

Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Cc: <syrjala@sci.fi>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Walls <awalls@md.metrocast.net>
Cc: Antonino Daplas <adaplas@gmail.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Doug Ledford <dledford@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomi Valkeinen <tomi.valkeinen@ti.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Ville Syrjälä <syrjala@sci.fi>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: airlied@linux.ie
Cc: benh@kernel.crashing.org
Cc: bhelgaas@google.com
Cc: dan.j.williams@intel.com
Cc: konrad.wilk@oracle.com
Cc: linux-fbdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-media@vger.kernel.org
Cc: mst@redhat.com
Cc: netdev@vger.kernel.org
Cc: vinod.koul@intel.com
Cc: xen-devel@lists.xensource.com
Link: http://lkml.kernel.org/r/1440443613-13696-12-git-send-email-mcgrof@do-not-panic.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/mtrr.txt      | 20 ++++++++++++++++----
 arch/x86/kernel/cpu/mtrr/main.c |  2 --
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/x86/mtrr.txt b/Documentation/x86/mtrr.txt
index 860bc3adc223..dc3e703913ac 100644
--- a/Documentation/x86/mtrr.txt
+++ b/Documentation/x86/mtrr.txt
@@ -6,10 +6,22 @@ Luis R. Rodriguez <mcgrof@do-not-panic.com> - April 9, 2015
 ===============================================================================
 Phasing out MTRR use
 
-MTRR use is replaced on modern x86 hardware with PAT. Over time the only type
-of effective MTRR that is expected to be supported will be for write-combining.
-As MTRR use is phased out device drivers should use arch_phys_wc_add() to make
-MTRR effective on non-PAT systems while a no-op on PAT enabled systems.
+MTRR use is replaced on modern x86 hardware with PAT. Direct MTRR use by
+drivers on Linux is now completely phased out, device drivers should use
+arch_phys_wc_add() in combination with ioremap_wc() to make MTRR effective on
+non-PAT systems while a no-op but equally effective on PAT enabled systems.
+
+Even if Linux does not use MTRRs directly, some x86 platform firmware may still
+set up MTRRs early before booting the OS. They do this as some platform
+firmware may still have implemented access to MTRRs which would be controlled
+and handled by the platform firmware directly. An example of platform use of
+MTRRs is through the use of SMI handlers, one case could be for fan control,
+the platform code would need uncachable access to some of its fan control
+registers. Such platform access does not need any Operating System MTRR code in
+place other than mtrr_type_lookup() to ensure any OS specific mapping requests
+are aligned with platform MTRR setup. If MTRRs are only set up by the platform
+firmware code though and the OS does not make any specific MTRR mapping
+requests mtrr_type_lookup() should always return MTRR_TYPE_INVALID.
 
 For details refer to Documentation/x86/pat.txt.
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index e7ed0d8ebacb..f891b4750f04 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -448,7 +448,6 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
 	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
 			     increment);
 }
-EXPORT_SYMBOL(mtrr_add);
 
 /**
  * mtrr_del_page - delete a memory type region
@@ -537,7 +536,6 @@ int mtrr_del(int reg, unsigned long base, unsigned long size)
 		return -EINVAL;
 	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
 }
-EXPORT_SYMBOL(mtrr_del);
 
 /**
  * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
-- 
cgit v1.2.3-70-g09d2


From a47d4576cd1c58157a2d8cfffa93aa7ca375eede Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 28 Aug 2015 10:30:15 +0200
Subject: x86/irq: Do not dereference irq descriptor before checking it

Having the IS_NULL_OR_ERR() check after dereferencing the pointer is
not really working well.

Move the dereference after the check.

Fixes: a782a7e46bb5 'x86/irq: Store irq descriptor in vector array'
Reported-and-tested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/irq_32.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 217b01388038..c80cf6699678 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -150,7 +150,7 @@ void do_softirq_own_stack(void)
 
 bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 {
-	unsigned int irq = irq_desc_get_irq(desc);
+	unsigned int irq;
 	int overflow;
 
 	overflow = check_stack_overflow();
@@ -158,6 +158,7 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 	if (IS_ERR_OR_NULL(desc))
 		return false;
 
+	irq = irq_desc_get_irq(desc);
 	if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
 		if (unlikely(overflow))
 			print_stack_overflow();
-- 
cgit v1.2.3-70-g09d2


From aacfbe6a9724bb6d66a656a5abcc681d5649ed92 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 4 Sep 2015 15:45:12 -0700
Subject: kernel/watchdog: move NMI function header declarations from
 watchdog.h to nmi.h

The kernel's NMI watchdog has nothing to do with the watchdog subsystem.
Its header declarations should be in linux/nmi.h, not linux/watchdog.h.

The code provided two sets of dummy functions if HARDLOCKUP_DETECTOR is
not configured, one in the include file and one in kernel/watchdog.c.
Remove the dummy functions from kernel/watchdog.c and use those from the
include file.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
 include/linux/nmi.h                    | 8 +++++---
 include/linux/watchdog.h               | 8 --------
 kernel/watchdog.c                      | 2 --
 4 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3f124d553c5a..36bd8250934b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <linux/watchdog.h>
+#include <linux/nmi.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index f94da0e65dea..088714537d10 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -26,10 +26,12 @@ static inline void touch_nmi_watchdog(void)
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void hardlockup_detector_disable(void);
+void watchdog_nmi_disable_all(void);
+void watchdog_nmi_enable_all(void);
 #else
-static inline void hardlockup_detector_disable(void)
-{
-}
+static inline void hardlockup_detector_disable(void) {}
+static inline void watchdog_nmi_disable_all(void) {}
+static inline void watchdog_nmi_enable_all(void) {}
 #endif
 
 /*
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index f47feada5b42..d74a0e907b9e 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
 extern int watchdog_register_device(struct watchdog_device *);
 extern void watchdog_unregister_device(struct watchdog_device *);
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void watchdog_nmi_disable_all(void);
-void watchdog_nmi_enable_all(void);
-#else
-static inline void watchdog_nmi_disable_all(void) {}
-static inline void watchdog_nmi_enable_all(void) {}
-#endif
-
 #endif  /* ifndef _LINUX_WATCHDOG_H */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d18330fa4776..e74d48bc3e61 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -651,8 +651,6 @@ unlock:
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
 static void watchdog_nmi_disable(unsigned int cpu) { return; }
-void watchdog_nmi_enable_all(void) {}
-void watchdog_nmi_disable_all(void) {}
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 static struct smp_hotplug_thread watchdog_threads = {
-- 
cgit v1.2.3-70-g09d2


From 999bbe49ea0118b70ddf3f5d679f51dc7a97ae55 Mon Sep 17 00:00:00 2001
From: Ulrich Obergfell <uobergfe@redhat.com>
Date: Fri, 4 Sep 2015 15:45:25 -0700
Subject: watchdog: use suspend/resume interface in fixup_ht_bug()

Remove watchdog_nmi_disable_all() and watchdog_nmi_enable_all() since
these functions are no longer needed.  If a subsystem has a need to
deactivate the watchdog temporarily, it should utilize the
watchdog_suspend() and watchdog_resume() functions.

[akpm@linux-foundation.org: fix build with CONFIG_LOCKUP_DETECTOR=m]
Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Stephane Eranian <eranian@google.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c |  7 +++++--
 include/linux/nmi.h                    | 13 +++++++++----
 kernel/watchdog.c                      | 35 ----------------------------------
 3 files changed, 14 insertions(+), 41 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 36bd8250934b..144ab91951a7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void)
 		return 0;
 	}
 
-	watchdog_nmi_disable_all();
+	if (watchdog_suspend() != 0) {
+		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+		return 0;
+	}
 
 	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void)
 	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
-	watchdog_nmi_enable_all();
+	watchdog_resume();
 
 	get_online_cpus();
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e9f213c337bb..e5afe8bae202 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -26,12 +26,8 @@ static inline void touch_nmi_watchdog(void)
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void hardlockup_detector_disable(void);
-void watchdog_nmi_disable_all(void);
-void watchdog_nmi_enable_all(void);
 #else
 static inline void hardlockup_detector_disable(void) {}
-static inline void watchdog_nmi_disable_all(void) {}
-static inline void watchdog_nmi_enable_all(void) {}
 #endif
 
 /*
@@ -84,6 +80,15 @@ extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
 extern int watchdog_suspend(void);
 extern void watchdog_resume(void);
+#else
+static inline int watchdog_suspend(void)
+{
+	return 0;
+}
+
+static inline void watchdog_resume(void)
+{
+}
 #endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index eb8f94b50101..69666f4b8e8f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -615,41 +615,6 @@ static void watchdog_nmi_disable(unsigned int cpu)
 	}
 }
 
-void watchdog_nmi_enable_all(void)
-{
-	int cpu;
-
-	mutex_lock(&watchdog_proc_mutex);
-
-	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-		goto unlock;
-
-	get_online_cpus();
-	for_each_watchdog_cpu(cpu)
-		watchdog_nmi_enable(cpu);
-	put_online_cpus();
-
-unlock:
-	mutex_unlock(&watchdog_proc_mutex);
-}
-
-void watchdog_nmi_disable_all(void)
-{
-	int cpu;
-
-	mutex_lock(&watchdog_proc_mutex);
-
-	if (!watchdog_running)
-		goto unlock;
-
-	get_online_cpus();
-	for_each_watchdog_cpu(cpu)
-		watchdog_nmi_disable(cpu);
-	put_online_cpus();
-
-unlock:
-	mutex_unlock(&watchdog_proc_mutex);
-}
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
 static void watchdog_nmi_disable(unsigned int cpu) { return; }
-- 
cgit v1.2.3-70-g09d2


From ec6a90661a0d6ce1461d05c7a58a0a151154e14a Mon Sep 17 00:00:00 2001
From: Ulrich Obergfell <uobergfe@redhat.com>
Date: Fri, 4 Sep 2015 15:45:28 -0700
Subject: watchdog: rename watchdog_suspend() and watchdog_resume()

Rename watchdog_suspend() to lockup_detector_suspend() and
watchdog_resume() to lockup_detector_resume() to avoid confusion with the
watchdog subsystem and to be consistent with the existing name
lockup_detector_init().

Also provide comment blocks to explain the watchdog_running and
watchdog_suspended variables and their relationship.

Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Stephane Eranian <eranian@google.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c |  4 ++--
 include/linux/nmi.h                    |  8 ++++----
 kernel/watchdog.c                      | 26 ++++++++++++++++++++++----
 3 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 144ab91951a7..cd9b6d0b10bf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -3627,7 +3627,7 @@ static __init int fixup_ht_bug(void)
 		return 0;
 	}
 
-	if (watchdog_suspend() != 0) {
+	if (lockup_detector_suspend() != 0) {
 		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
 		return 0;
 	}
@@ -3638,7 +3638,7 @@ static __init int fixup_ht_bug(void)
 	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
-	watchdog_resume();
+	lockup_detector_resume();
 
 	get_online_cpus();
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e5afe8bae202..a91adf6e02f2 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -78,15 +78,15 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
-extern int watchdog_suspend(void);
-extern void watchdog_resume(void);
+extern int lockup_detector_suspend(void);
+extern void lockup_detector_resume(void);
 #else
-static inline int watchdog_suspend(void)
+static inline int lockup_detector_suspend(void)
 {
 	return 0;
 }
 
-static inline void watchdog_resume(void)
+static inline void lockup_detector_resume(void)
 {
 }
 #endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 69666f4b8e8f..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -67,8 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
-static int __read_mostly watchdog_suspended;
+/*
+ * The 'watchdog_running' variable is set to 1 when the watchdog threads
+ * are registered/started and is set to 0 when the watchdog threads are
+ * unregistered/stopped, so it is an indicator whether the threads exist.
+ */
 static int __read_mostly watchdog_running;
+/*
+ * If a subsystem has a need to deactivate the watchdog temporarily, it
+ * can use the suspend/resume interface to achieve this. The content of
+ * the 'watchdog_suspended' variable reflects this state. Existing threads
+ * are parked/unparked by the lockup_detector_{suspend|resume} functions
+ * (see comment blocks pertaining to those functions for further details).
+ *
+ * 'watchdog_suspended' also prevents threads from being registered/started
+ * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
+ * of 'watchdog_running' cannot change while the watchdog is deactivated
+ * temporarily (see related code in 'proc' handlers).
+ */
+static int __read_mostly watchdog_suspended;
+
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -669,7 +687,7 @@ static void watchdog_unpark_threads(void)
 /*
  * Suspend the hard and soft lockup detector by parking the watchdog threads.
  */
-int watchdog_suspend(void)
+int lockup_detector_suspend(void)
 {
 	int ret = 0;
 
@@ -679,7 +697,7 @@ int watchdog_suspend(void)
 	 * the 'watchdog_suspended' variable). If the watchdog threads are
 	 * running, the first caller takes care that they will be parked.
 	 * The state of 'watchdog_running' cannot change while a suspend
-	 * request is active (see related changes in 'proc' handlers).
+	 * request is active (see related code in 'proc' handlers).
 	 */
 	if (watchdog_running && !watchdog_suspended)
 		ret = watchdog_park_threads();
@@ -695,7 +713,7 @@ int watchdog_suspend(void)
 /*
  * Resume the hard and soft lockup detector by unparking the watchdog threads.
  */
-void watchdog_resume(void)
+void lockup_detector_resume(void)
 {
 	mutex_lock(&watchdog_proc_mutex);
 
-- 
cgit v1.2.3-70-g09d2


From 5dd2c4bded8776ee93c8f38b739fea531095067f Mon Sep 17 00:00:00 2001
From: Mark Salter <msalter@redhat.com>
Date: Tue, 8 Sep 2015 15:03:07 -0700
Subject: x86: use generic early mem copy

The early_ioremap library now has a generic copy_from_early_mem()
function.  Use the generic copy function for x86 relocate_initrd().

[akpm@linux-foundation.org: remove MAX_MAP_CHUNK define, per Yinghai Lu]
Signed-off-by: Mark Salter <msalter@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/setup.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b143c2d04420..baadbf90a7c5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void)
 	return ramdisk_size;
 }
 
-#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
 static void __init relocate_initrd(void)
 {
 	/* Assume only end is not page aligned */
 	u64 ramdisk_image = get_ramdisk_image();
 	u64 ramdisk_size  = get_ramdisk_size();
 	u64 area_size     = PAGE_ALIGN(ramdisk_size);
-	unsigned long slop, clen, mapaddr;
-	char *p, *q;
 
 	/* We need to move the initrd down into directly mapped mem */
 	relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
@@ -343,25 +340,8 @@ static void __init relocate_initrd(void)
 	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
 	       relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
 
-	q = (char *)initrd_start;
-
-	/* Copy the initrd */
-	while (ramdisk_size) {
-		slop = ramdisk_image & ~PAGE_MASK;
-		clen = ramdisk_size;
-		if (clen > MAX_MAP_CHUNK-slop)
-			clen = MAX_MAP_CHUNK-slop;
-		mapaddr = ramdisk_image & PAGE_MASK;
-		p = early_memremap(mapaddr, clen+slop);
-		memcpy(q, p+slop, clen);
-		early_memunmap(p, clen+slop);
-		q += clen;
-		ramdisk_image += clen;
-		ramdisk_size  -= clen;
-	}
+	copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
 
-	ramdisk_image = get_ramdisk_image();
-	ramdisk_size  = get_ramdisk_size();
 	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
 		" [mem %#010llx-%#010llx]\n",
 		ramdisk_image, ramdisk_image + ramdisk_size - 1,
-- 
cgit v1.2.3-70-g09d2


From 2965faa5e03d1e71e9ff9aa143fff39e0a77543a Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Wed, 9 Sep 2015 15:38:55 -0700
Subject: kexec: split kexec_load syscall from kexec core code

There are two kexec load syscalls, kexec_load another and kexec_file_load.
 kexec_file_load has been splited as kernel/kexec_file.c.  In this patch I
split kexec_load syscall code to kernel/kexec.c.

And add a new kconfig option KEXEC_CORE, so we can disable kexec_load and
use kexec_file_load only, or vice verse.

The original requirement is from Ted Ts'o, he want kexec kernel signature
being checked with CONFIG_KEXEC_VERIFY_SIG enabled.  But kexec-tools use
kexec_load syscall can bypass the checking.

Vivek Goyal proposed to create a common kconfig option so user can compile
in only one syscall for loading kexec kernel.  KEXEC/KEXEC_FILE selects
KEXEC_CORE so that old config files still work.

Because there's general code need CONFIG_KEXEC_CORE, so I updated all the
architecture Kconfig with a new option KEXEC_CORE, and let KEXEC selects
KEXEC_CORE in arch Kconfig.  Also updated general kernel code with to
kexec_load syscall.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Dave Young <dyoung@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Petr Tesarik <ptesarik@suse.cz>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Josh Boyer <jwboyer@fedoraproject.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                  |    3 +
 arch/arm/Kconfig              |    1 +
 arch/ia64/Kconfig             |    1 +
 arch/m68k/Kconfig             |    1 +
 arch/mips/Kconfig             |    1 +
 arch/powerpc/Kconfig          |    1 +
 arch/s390/Kconfig             |    1 +
 arch/sh/Kconfig               |    1 +
 arch/tile/Kconfig             |    1 +
 arch/x86/Kconfig              |    3 +-
 arch/x86/boot/header.S        |    2 +-
 arch/x86/include/asm/kdebug.h |    2 +-
 arch/x86/kernel/Makefile      |    4 +-
 arch/x86/kernel/kvmclock.c    |    4 +-
 arch/x86/kernel/reboot.c      |    4 +-
 arch/x86/kernel/setup.c       |    2 +-
 arch/x86/kernel/vmlinux.lds.S |    2 +-
 arch/x86/kvm/vmx.c            |    8 +-
 arch/x86/platform/efi/efi.c   |    4 +-
 arch/x86/platform/uv/uv_nmi.c |    6 +-
 drivers/firmware/efi/Kconfig  |    2 +-
 drivers/pci/pci-driver.c      |    2 +-
 include/linux/kexec.h         |    6 +-
 init/initramfs.c              |    4 +-
 kernel/Makefile               |    1 +
 kernel/events/core.c          |    2 +-
 kernel/kexec.c                | 1495 +---------------------------------------
 kernel/kexec_core.c           | 1511 +++++++++++++++++++++++++++++++++++++++++
 kernel/ksysfs.c               |    6 +-
 kernel/printk/printk.c        |    2 +-
 kernel/reboot.c               |    2 +-
 kernel/sysctl.c               |    2 +-
 32 files changed, 1560 insertions(+), 1527 deletions(-)
 create mode 100644 kernel/kexec_core.c

(limited to 'arch/x86/kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 8f3564930580..4e949e58b192 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -2,6 +2,9 @@
 # General architecture dependent options
 #
 
+config KEXEC_CORE
+	bool
+
 config OPROFILE
 	tristate "OProfile system profiling"
 	depends on PROFILING
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0d1b717e1eca..72ad724c67ae 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2020,6 +2020,7 @@ config KEXEC
 	bool "Kexec system call (EXPERIMENTAL)"
 	depends on (!SMP || PM_SLEEP_SMP)
 	depends on !CPU_V7M
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 42a91a7aa2b0..eb0249e37981 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -518,6 +518,7 @@ source "drivers/sn/Kconfig"
 config KEXEC
 	bool "kexec system call"
 	depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 2dd8f63bfbbb..498b567f007b 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -95,6 +95,7 @@ config MMU_SUN3
 config KEXEC
 	bool "kexec system call"
 	depends on M68KCLASSIC
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 752acca8de1f..e3aa5b0b4ef1 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2597,6 +2597,7 @@ source "kernel/Kconfig.preempt"
 
 config KEXEC
 	bool "Kexec system call"
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b447918b9e2c..9a7057ec2154 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -420,6 +420,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
 config KEXEC
 	bool "kexec system call"
 	depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP))
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 4827870f7a6d..1d57000b1b24 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -48,6 +48,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 
 config KEXEC
 	def_bool y
+	select KEXEC_CORE
 
 config AUDIT_ARCH
 	def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..d514df7e04dd 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -602,6 +602,7 @@ source kernel/Kconfig.hz
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
 	depends on SUPERH32 && MMU
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 2ba12d761723..106c21bd7f44 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -205,6 +205,7 @@ source "kernel/Kconfig.hz"
 
 config KEXEC
 	bool "kexec system call"
+	select KEXEC_CORE
 	---help---
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc0d73eac047..7aef2d52daa0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1754,6 +1754,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call"
+	select KEXEC_CORE
 	---help---
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
@@ -1770,8 +1771,8 @@ config KEXEC
 
 config KEXEC_FILE
 	bool "kexec file based system call"
+	select KEXEC_CORE
 	select BUILD_BIN2C
-	depends on KEXEC
 	depends on X86_64
 	depends on CRYPTO=y
 	depends on CRYPTO_SHA256=y
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 16ef02596db2..2d6b309c8e9a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -414,7 +414,7 @@ xloadflags:
 # define XLF23 0
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
 # define XLF4 XLF_EFI_KEXEC
 #else
 # define XLF4 0
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 32ce71375b21..b130d59406fb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
 extern void __show_regs(struct pt_regs *regs, int all);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 extern int in_crash_kexec;
 #else
 /* no crash dump is ever in progress if no crash kernel can be kexec'd */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9ffdf25e5b86..b1b78ffe01d0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -71,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH)		+= livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_X86_TSC)		+= trace_clock.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE)	+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)	+= kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-y				+= kprobes/
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 49487b488061..2c7aafa70702 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
  * kind of shutdown from our side, we unregister the clock by writting anything
  * that does not have the 'enable' bit set in the msr
  */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
 	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
 	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
 	machine_ops.shutdown  = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
 	kvm_get_preset_lpj();
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 86db4bcd7ce5..02693dd9a079 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
 	.emergency_restart = native_machine_emergency_restart,
 	.restart = native_machine_restart,
 	.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	.crash_shutdown = native_machine_crash_shutdown,
 #endif
 };
@@ -703,7 +703,7 @@ void machine_halt(void)
 	machine_ops.halt();
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 void machine_crash_shutdown(struct pt_regs *regs)
 {
 	machine_ops.crash_shutdown(regs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index baadbf90a7c5..fdb7f2a2d328 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -478,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
  * --------- Crashkernel reservation ------------------------------
  */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300fd846..74e4bf11f562 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
 
 #endif /* CONFIG_X86_32 */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <asm/kexec.h>
 
 . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 148ea2016022..d01986832afc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This bitmap is used to indicate whether the vmclear
  * operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
 #else
 static inline void crash_enable_local_vmclear(int cpu) { }
 static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
 {
@@ -10411,7 +10411,7 @@ static int __init vmx_init(void)
 	if (r)
 		return r;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
 			   crash_vmclear_local_loaded_vmcss);
 #endif
@@ -10421,7 +10421,7 @@ static int __init vmx_init(void)
 
 static void __exit vmx_exit(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
 	synchronize_rcu();
 #endif
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e4308fe6afe8..1db84c0758b7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
 
 static void __init save_runtime_map(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	efi_memory_desc_t *md;
 	void *tmp, *p, *q = NULL;
 	int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
 
 static void __init kexec_enter_virtual_mode(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	efi_memory_desc_t *md;
 	void *p;
 
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 020c101c255f..5c9f63fa6abf 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
 	touch_nmi_watchdog();
 }
 
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
 static atomic_t uv_nmi_kexec_failed;
 static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 	uv_nmi_sync_exit(0);
 }
 
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
 static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
 	if (master)
 		pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
 }
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_KGDB
 #ifdef CONFIG_KGDB_KDB
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 54071c148340..84533e02fbf8 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -43,7 +43,7 @@ config EFI_VARS_PSTORE_DEFAULT_DISABLE
 
 config EFI_RUNTIME_MAP
 	bool "Export efi runtime maps to sysfs"
-	depends on X86 && EFI && KEXEC
+	depends on X86 && EFI && KEXEC_CORE
 	default y
 	help
 	  Export efi runtime memory maps to /sys/firmware/efi/runtime-map.
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 52a880ca1768..dd652f2ae03d 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -467,7 +467,7 @@ static void pci_device_shutdown(struct device *dev)
 	pci_msi_shutdown(pci_dev);
 	pci_msix_shutdown(pci_dev);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	/*
 	 * If this is a kexec reboot, turn off Bus Master bit on the
 	 * device to tell it to not continue to do DMA. Don't touch
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index ab150ade0d18..d140b1e9faa7 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -16,7 +16,7 @@
 
 #include <uapi/linux/kexec.h>
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <linux/list.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
@@ -329,13 +329,13 @@ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
 int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 					unsigned int relsec);
 
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #define kexec_in_progress false
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 #endif /* !defined(__ASSEBMLY__) */
 
diff --git a/init/initramfs.c b/init/initramfs.c
index ad1bd7787bbb..b32ad7d97ac9 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -526,14 +526,14 @@ extern unsigned long __initramfs_size;
 
 static void __init free_initrd(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
 	unsigned long crashk_end   = (unsigned long)__va(crashk_res.end);
 #endif
 	if (do_retain_initrd)
 		goto skip;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	/*
 	 * If the initrd region is overlapped with crashkernel reserved region,
 	 * free only memory that is not part of crashkernel region.
diff --git a/kernel/Makefile b/kernel/Makefile
index 1b4890af5a65..d4988410b410 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e8183895691c..f548f69c4299 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9094,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
 	mutex_unlock(&swhash->hlist_mutex);
 }
 
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
 static void __perf_event_exit_context(void *__info)
 {
 	struct remove_event re = { .detach_group = true };
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2d73ecfa5505..4c5edc357923 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,148 +1,23 @@
 /*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
  *
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
  */
 
-#define pr_fmt(fmt)	"kexec: " fmt
-
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
 #include <linux/kexec.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-#include <linux/highmem.h>
 #include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
 #include <linux/vmalloc.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
+#include <linux/slab.h>
 
-#include <crypto/hash.h>
-#include <crypto/sha.h>
 #include "kexec_internal.h"
 
-DEFINE_MUTEX(kexec_mutex);
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
-	/*
-	 * If crash_kexec_post_notifiers is enabled, don't run
-	 * crash_kexec() here yet, which must be run after panic
-	 * notifiers in panic().
-	 */
-	if (crash_kexec_post_notifiers)
-		return 0;
-	/*
-	 * There are 4 panic() calls in do_exit() path, each of which
-	 * corresponds to each of these 4 conditions.
-	 */
-	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
-		return 1;
-	return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses.  On processors
- * where you can disable the MMU this is trivial, and easy.  For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place.  This means I can only support memory whose
- * physical address can fit in an unsigned long.  In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages.  As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it).  The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- *  - allocating a page table with the control code buffer identity
- *    mapped, to simplify machine_kexec and make kexec_on_panic more
- *    reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
-
-static struct page *kimage_alloc_page(struct kimage *image,
-				       gfp_t gfp_mask,
-				       unsigned long dest);
-
 static int copy_user_segment_list(struct kimage *image,
 				  unsigned long nr_segments,
 				  struct kexec_segment __user *segments)
@@ -160,123 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
 	return ret;
 }
 
-int sanity_check_segment_list(struct kimage *image)
-{
-	int result, i;
-	unsigned long nr_segments = image->nr_segments;
-
-	/*
-	 * Verify we have good destination addresses.  The caller is
-	 * responsible for making certain we don't attempt to load
-	 * the new image into invalid or reserved areas of RAM.  This
-	 * just verifies it is an address we can use.
-	 *
-	 * Since the kernel does everything in page size chunks ensure
-	 * the destination addresses are page aligned.  Too many
-	 * special cases crop of when we don't do this.  The most
-	 * insidious is getting overlapping destination addresses
-	 * simply because addresses are changed to page size
-	 * granularity.
-	 */
-	result = -EADDRNOTAVAIL;
-	for (i = 0; i < nr_segments; i++) {
-		unsigned long mstart, mend;
-
-		mstart = image->segment[i].mem;
-		mend   = mstart + image->segment[i].memsz;
-		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
-			return result;
-		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
-			return result;
-	}
-
-	/* Verify our destination addresses do not overlap.
-	 * If we alloed overlapping destination addresses
-	 * through very weird things can happen with no
-	 * easy explanation as one segment stops on another.
-	 */
-	result = -EINVAL;
-	for (i = 0; i < nr_segments; i++) {
-		unsigned long mstart, mend;
-		unsigned long j;
-
-		mstart = image->segment[i].mem;
-		mend   = mstart + image->segment[i].memsz;
-		for (j = 0; j < i; j++) {
-			unsigned long pstart, pend;
-			pstart = image->segment[j].mem;
-			pend   = pstart + image->segment[j].memsz;
-			/* Do the segments overlap ? */
-			if ((mend > pstart) && (mstart < pend))
-				return result;
-		}
-	}
-
-	/* Ensure our buffer sizes are strictly less than
-	 * our memory sizes.  This should always be the case,
-	 * and it is easier to check up front than to be surprised
-	 * later on.
-	 */
-	result = -EINVAL;
-	for (i = 0; i < nr_segments; i++) {
-		if (image->segment[i].bufsz > image->segment[i].memsz)
-			return result;
-	}
-
-	/*
-	 * Verify we have good destination addresses.  Normally
-	 * the caller is responsible for making certain we don't
-	 * attempt to load the new image into invalid or reserved
-	 * areas of RAM.  But crash kernels are preloaded into a
-	 * reserved area of ram.  We must ensure the addresses
-	 * are in the reserved area otherwise preloading the
-	 * kernel could corrupt things.
-	 */
-
-	if (image->type == KEXEC_TYPE_CRASH) {
-		result = -EADDRNOTAVAIL;
-		for (i = 0; i < nr_segments; i++) {
-			unsigned long mstart, mend;
-
-			mstart = image->segment[i].mem;
-			mend = mstart + image->segment[i].memsz - 1;
-			/* Ensure we are within the crash kernel limits */
-			if ((mstart < crashk_res.start) ||
-			    (mend > crashk_res.end))
-				return result;
-		}
-	}
-
-	return 0;
-}
-
-struct kimage *do_kimage_alloc_init(void)
-{
-	struct kimage *image;
-
-	/* Allocate a controlling structure */
-	image = kzalloc(sizeof(*image), GFP_KERNEL);
-	if (!image)
-		return NULL;
-
-	image->head = 0;
-	image->entry = &image->head;
-	image->last_entry = &image->head;
-	image->control_page = ~0; /* By default this does not apply */
-	image->type = KEXEC_TYPE_DEFAULT;
-
-	/* Initialize the list of control pages */
-	INIT_LIST_HEAD(&image->control_pages);
-
-	/* Initialize the list of destination pages */
-	INIT_LIST_HEAD(&image->dest_pages);
-
-	/* Initialize the list of unusable pages */
-	INIT_LIST_HEAD(&image->unusable_pages);
-
-	return image;
-}
-
 static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 			     unsigned long nr_segments,
 			     struct kexec_segment __user *segments,
@@ -343,597 +101,6 @@ out_free_image:
 	return ret;
 }
 
-int kimage_is_destination_range(struct kimage *image,
-					unsigned long start,
-					unsigned long end)
-{
-	unsigned long i;
-
-	for (i = 0; i < image->nr_segments; i++) {
-		unsigned long mstart, mend;
-
-		mstart = image->segment[i].mem;
-		mend = mstart + image->segment[i].memsz;
-		if ((end > mstart) && (start < mend))
-			return 1;
-	}
-
-	return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
-	struct page *pages;
-
-	pages = alloc_pages(gfp_mask, order);
-	if (pages) {
-		unsigned int count, i;
-		pages->mapping = NULL;
-		set_page_private(pages, order);
-		count = 1 << order;
-		for (i = 0; i < count; i++)
-			SetPageReserved(pages + i);
-	}
-
-	return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
-	unsigned int order, count, i;
-
-	order = page_private(page);
-	count = 1 << order;
-	for (i = 0; i < count; i++)
-		ClearPageReserved(page + i);
-	__free_pages(page, order);
-}
-
-void kimage_free_page_list(struct list_head *list)
-{
-	struct list_head *pos, *next;
-
-	list_for_each_safe(pos, next, list) {
-		struct page *page;
-
-		page = list_entry(pos, struct page, lru);
-		list_del(&page->lru);
-		kimage_free_pages(page);
-	}
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
-							unsigned int order)
-{
-	/* Control pages are special, they are the intermediaries
-	 * that are needed while we copy the rest of the pages
-	 * to their final resting place.  As such they must
-	 * not conflict with either the destination addresses
-	 * or memory the kernel is already using.
-	 *
-	 * The only case where we really need more than one of
-	 * these are for architectures where we cannot disable
-	 * the MMU and must instead generate an identity mapped
-	 * page table for all of the memory.
-	 *
-	 * At worst this runs in O(N) of the image size.
-	 */
-	struct list_head extra_pages;
-	struct page *pages;
-	unsigned int count;
-
-	count = 1 << order;
-	INIT_LIST_HEAD(&extra_pages);
-
-	/* Loop while I can allocate a page and the page allocated
-	 * is a destination page.
-	 */
-	do {
-		unsigned long pfn, epfn, addr, eaddr;
-
-		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
-		if (!pages)
-			break;
-		pfn   = page_to_pfn(pages);
-		epfn  = pfn + count;
-		addr  = pfn << PAGE_SHIFT;
-		eaddr = epfn << PAGE_SHIFT;
-		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
-			      kimage_is_destination_range(image, addr, eaddr)) {
-			list_add(&pages->lru, &extra_pages);
-			pages = NULL;
-		}
-	} while (!pages);
-
-	if (pages) {
-		/* Remember the allocated page... */
-		list_add(&pages->lru, &image->control_pages);
-
-		/* Because the page is already in it's destination
-		 * location we will never allocate another page at
-		 * that address.  Therefore kimage_alloc_pages
-		 * will not return it (again) and we don't need
-		 * to give it an entry in image->segment[].
-		 */
-	}
-	/* Deal with the destination pages I have inadvertently allocated.
-	 *
-	 * Ideally I would convert multi-page allocations into single
-	 * page allocations, and add everything to image->dest_pages.
-	 *
-	 * For now it is simpler to just free the pages.
-	 */
-	kimage_free_page_list(&extra_pages);
-
-	return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
-						      unsigned int order)
-{
-	/* Control pages are special, they are the intermediaries
-	 * that are needed while we copy the rest of the pages
-	 * to their final resting place.  As such they must
-	 * not conflict with either the destination addresses
-	 * or memory the kernel is already using.
-	 *
-	 * Control pages are also the only pags we must allocate
-	 * when loading a crash kernel.  All of the other pages
-	 * are specified by the segments and we just memcpy
-	 * into them directly.
-	 *
-	 * The only case where we really need more than one of
-	 * these are for architectures where we cannot disable
-	 * the MMU and must instead generate an identity mapped
-	 * page table for all of the memory.
-	 *
-	 * Given the low demand this implements a very simple
-	 * allocator that finds the first hole of the appropriate
-	 * size in the reserved memory region, and allocates all
-	 * of the memory up to and including the hole.
-	 */
-	unsigned long hole_start, hole_end, size;
-	struct page *pages;
-
-	pages = NULL;
-	size = (1 << order) << PAGE_SHIFT;
-	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
-	hole_end   = hole_start + size - 1;
-	while (hole_end <= crashk_res.end) {
-		unsigned long i;
-
-		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
-			break;
-		/* See if I overlap any of the segments */
-		for (i = 0; i < image->nr_segments; i++) {
-			unsigned long mstart, mend;
-
-			mstart = image->segment[i].mem;
-			mend   = mstart + image->segment[i].memsz - 1;
-			if ((hole_end >= mstart) && (hole_start <= mend)) {
-				/* Advance the hole to the end of the segment */
-				hole_start = (mend + (size - 1)) & ~(size - 1);
-				hole_end   = hole_start + size - 1;
-				break;
-			}
-		}
-		/* If I don't overlap any segments I have found my hole! */
-		if (i == image->nr_segments) {
-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
-			break;
-		}
-	}
-	if (pages)
-		image->control_page = hole_end;
-
-	return pages;
-}
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
-					 unsigned int order)
-{
-	struct page *pages = NULL;
-
-	switch (image->type) {
-	case KEXEC_TYPE_DEFAULT:
-		pages = kimage_alloc_normal_control_pages(image, order);
-		break;
-	case KEXEC_TYPE_CRASH:
-		pages = kimage_alloc_crash_control_pages(image, order);
-		break;
-	}
-
-	return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
-	if (*image->entry != 0)
-		image->entry++;
-
-	if (image->entry == image->last_entry) {
-		kimage_entry_t *ind_page;
-		struct page *page;
-
-		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
-		if (!page)
-			return -ENOMEM;
-
-		ind_page = page_address(page);
-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
-		image->entry = ind_page;
-		image->last_entry = ind_page +
-				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
-	}
-	*image->entry = entry;
-	image->entry++;
-	*image->entry = 0;
-
-	return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
-				   unsigned long destination)
-{
-	int result;
-
-	destination &= PAGE_MASK;
-	result = kimage_add_entry(image, destination | IND_DESTINATION);
-
-	return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
-	int result;
-
-	page &= PAGE_MASK;
-	result = kimage_add_entry(image, page | IND_SOURCE);
-
-	return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
-	/* Walk through and free any extra destination pages I may have */
-	kimage_free_page_list(&image->dest_pages);
-
-	/* Walk through and free any unusable pages I have cached */
-	kimage_free_page_list(&image->unusable_pages);
-
-}
-void kimage_terminate(struct kimage *image)
-{
-	if (*image->entry != 0)
-		image->entry++;
-
-	*image->entry = IND_DONE;
-}
-
-#define for_each_kimage_entry(image, ptr, entry) \
-	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
-		ptr = (entry & IND_INDIRECTION) ? \
-			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
-	struct page *page;
-
-	page = pfn_to_page(entry >> PAGE_SHIFT);
-	kimage_free_pages(page);
-}
-
-void kimage_free(struct kimage *image)
-{
-	kimage_entry_t *ptr, entry;
-	kimage_entry_t ind = 0;
-
-	if (!image)
-		return;
-
-	kimage_free_extra_pages(image);
-	for_each_kimage_entry(image, ptr, entry) {
-		if (entry & IND_INDIRECTION) {
-			/* Free the previous indirection page */
-			if (ind & IND_INDIRECTION)
-				kimage_free_entry(ind);
-			/* Save this indirection page until we are
-			 * done with it.
-			 */
-			ind = entry;
-		} else if (entry & IND_SOURCE)
-			kimage_free_entry(entry);
-	}
-	/* Free the final indirection page */
-	if (ind & IND_INDIRECTION)
-		kimage_free_entry(ind);
-
-	/* Handle any machine specific cleanup */
-	machine_kexec_cleanup(image);
-
-	/* Free the kexec control pages... */
-	kimage_free_page_list(&image->control_pages);
-
-	/*
-	 * Free up any temporary buffers allocated. This might hit if
-	 * error occurred much later after buffer allocation.
-	 */
-	if (image->file_mode)
-		kimage_file_post_load_cleanup(image);
-
-	kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
-					unsigned long page)
-{
-	kimage_entry_t *ptr, entry;
-	unsigned long destination = 0;
-
-	for_each_kimage_entry(image, ptr, entry) {
-		if (entry & IND_DESTINATION)
-			destination = entry & PAGE_MASK;
-		else if (entry & IND_SOURCE) {
-			if (page == destination)
-				return ptr;
-			destination += PAGE_SIZE;
-		}
-	}
-
-	return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
-					gfp_t gfp_mask,
-					unsigned long destination)
-{
-	/*
-	 * Here we implement safeguards to ensure that a source page
-	 * is not copied to its destination page before the data on
-	 * the destination page is no longer useful.
-	 *
-	 * To do this we maintain the invariant that a source page is
-	 * either its own destination page, or it is not a
-	 * destination page at all.
-	 *
-	 * That is slightly stronger than required, but the proof
-	 * that no problems will not occur is trivial, and the
-	 * implementation is simply to verify.
-	 *
-	 * When allocating all pages normally this algorithm will run
-	 * in O(N) time, but in the worst case it will run in O(N^2)
-	 * time.   If the runtime is a problem the data structures can
-	 * be fixed.
-	 */
-	struct page *page;
-	unsigned long addr;
-
-	/*
-	 * Walk through the list of destination pages, and see if I
-	 * have a match.
-	 */
-	list_for_each_entry(page, &image->dest_pages, lru) {
-		addr = page_to_pfn(page) << PAGE_SHIFT;
-		if (addr == destination) {
-			list_del(&page->lru);
-			return page;
-		}
-	}
-	page = NULL;
-	while (1) {
-		kimage_entry_t *old;
-
-		/* Allocate a page, if we run out of memory give up */
-		page = kimage_alloc_pages(gfp_mask, 0);
-		if (!page)
-			return NULL;
-		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) >
-				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
-			list_add(&page->lru, &image->unusable_pages);
-			continue;
-		}
-		addr = page_to_pfn(page) << PAGE_SHIFT;
-
-		/* If it is the destination page we want use it */
-		if (addr == destination)
-			break;
-
-		/* If the page is not a destination page use it */
-		if (!kimage_is_destination_range(image, addr,
-						  addr + PAGE_SIZE))
-			break;
-
-		/*
-		 * I know that the page is someones destination page.
-		 * See if there is already a source page for this
-		 * destination page.  And if so swap the source pages.
-		 */
-		old = kimage_dst_used(image, addr);
-		if (old) {
-			/* If so move it */
-			unsigned long old_addr;
-			struct page *old_page;
-
-			old_addr = *old & PAGE_MASK;
-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
-			copy_highpage(page, old_page);
-			*old = addr | (*old & ~PAGE_MASK);
-
-			/* The old page I have found cannot be a
-			 * destination page, so return it if it's
-			 * gfp_flags honor the ones passed in.
-			 */
-			if (!(gfp_mask & __GFP_HIGHMEM) &&
-			    PageHighMem(old_page)) {
-				kimage_free_pages(old_page);
-				continue;
-			}
-			addr = old_addr;
-			page = old_page;
-			break;
-		} else {
-			/* Place the page on the destination list I
-			 * will use it later.
-			 */
-			list_add(&page->lru, &image->dest_pages);
-		}
-	}
-
-	return page;
-}
-
-static int kimage_load_normal_segment(struct kimage *image,
-					 struct kexec_segment *segment)
-{
-	unsigned long maddr;
-	size_t ubytes, mbytes;
-	int result;
-	unsigned char __user *buf = NULL;
-	unsigned char *kbuf = NULL;
-
-	result = 0;
-	if (image->file_mode)
-		kbuf = segment->kbuf;
-	else
-		buf = segment->buf;
-	ubytes = segment->bufsz;
-	mbytes = segment->memsz;
-	maddr = segment->mem;
-
-	result = kimage_set_destination(image, maddr);
-	if (result < 0)
-		goto out;
-
-	while (mbytes) {
-		struct page *page;
-		char *ptr;
-		size_t uchunk, mchunk;
-
-		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
-		if (!page) {
-			result  = -ENOMEM;
-			goto out;
-		}
-		result = kimage_add_page(image, page_to_pfn(page)
-								<< PAGE_SHIFT);
-		if (result < 0)
-			goto out;
-
-		ptr = kmap(page);
-		/* Start with a clear page */
-		clear_page(ptr);
-		ptr += maddr & ~PAGE_MASK;
-		mchunk = min_t(size_t, mbytes,
-				PAGE_SIZE - (maddr & ~PAGE_MASK));
-		uchunk = min(ubytes, mchunk);
-
-		/* For file based kexec, source pages are in kernel memory */
-		if (image->file_mode)
-			memcpy(ptr, kbuf, uchunk);
-		else
-			result = copy_from_user(ptr, buf, uchunk);
-		kunmap(page);
-		if (result) {
-			result = -EFAULT;
-			goto out;
-		}
-		ubytes -= uchunk;
-		maddr  += mchunk;
-		if (image->file_mode)
-			kbuf += mchunk;
-		else
-			buf += mchunk;
-		mbytes -= mchunk;
-	}
-out:
-	return result;
-}
-
-static int kimage_load_crash_segment(struct kimage *image,
-					struct kexec_segment *segment)
-{
-	/* For crash dumps kernels we simply copy the data from
-	 * user space to it's destination.
-	 * We do things a page at a time for the sake of kmap.
-	 */
-	unsigned long maddr;
-	size_t ubytes, mbytes;
-	int result;
-	unsigned char __user *buf = NULL;
-	unsigned char *kbuf = NULL;
-
-	result = 0;
-	if (image->file_mode)
-		kbuf = segment->kbuf;
-	else
-		buf = segment->buf;
-	ubytes = segment->bufsz;
-	mbytes = segment->memsz;
-	maddr = segment->mem;
-	while (mbytes) {
-		struct page *page;
-		char *ptr;
-		size_t uchunk, mchunk;
-
-		page = pfn_to_page(maddr >> PAGE_SHIFT);
-		if (!page) {
-			result  = -ENOMEM;
-			goto out;
-		}
-		ptr = kmap(page);
-		ptr += maddr & ~PAGE_MASK;
-		mchunk = min_t(size_t, mbytes,
-				PAGE_SIZE - (maddr & ~PAGE_MASK));
-		uchunk = min(ubytes, mchunk);
-		if (mchunk > uchunk) {
-			/* Zero the trailing part of the page */
-			memset(ptr + uchunk, 0, mchunk - uchunk);
-		}
-
-		/* For file based kexec, source pages are in kernel memory */
-		if (image->file_mode)
-			memcpy(ptr, kbuf, uchunk);
-		else
-			result = copy_from_user(ptr, buf, uchunk);
-		kexec_flush_icache_page(page);
-		kunmap(page);
-		if (result) {
-			result = -EFAULT;
-			goto out;
-		}
-		ubytes -= uchunk;
-		maddr  += mchunk;
-		if (image->file_mode)
-			kbuf += mchunk;
-		else
-			buf += mchunk;
-		mbytes -= mchunk;
-	}
-out:
-	return result;
-}
-
-int kimage_load_segment(struct kimage *image,
-				struct kexec_segment *segment)
-{
-	int result = -ENOMEM;
-
-	switch (image->type) {
-	case KEXEC_TYPE_DEFAULT:
-		result = kimage_load_normal_segment(image, segment);
-		break;
-	case KEXEC_TYPE_CRASH:
-		result = kimage_load_crash_segment(image, segment);
-		break;
-	}
-
-	return result;
-}
-
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
  *
@@ -954,9 +121,6 @@ int kimage_load_segment(struct kimage *image,
  * kexec does not sync, or unmount filesystems so if you need
  * that to happen you need to do that yourself.
  */
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
 
 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		struct kexec_segment __user *, segments, unsigned long, flags)
@@ -1051,18 +215,6 @@ out:
 	return result;
 }
 
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
 		       compat_ulong_t, nr_segments,
@@ -1101,646 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
 	return sys_kexec_load(entry, nr_segments, ksegments, flags);
 }
 #endif
-
-void crash_kexec(struct pt_regs *regs)
-{
-	/* Take the kexec_mutex here to prevent sys_kexec_load
-	 * running on one cpu from replacing the crash kernel
-	 * we are using after a panic on a different cpu.
-	 *
-	 * If the crash kernel was not located in a fixed area
-	 * of memory the xchg(&kexec_crash_image) would be
-	 * sufficient.  But since I reuse the memory...
-	 */
-	if (mutex_trylock(&kexec_mutex)) {
-		if (kexec_crash_image) {
-			struct pt_regs fixed_regs;
-
-			crash_setup_regs(&fixed_regs, regs);
-			crash_save_vmcoreinfo();
-			machine_crash_shutdown(&fixed_regs);
-			machine_kexec(kexec_crash_image);
-		}
-		mutex_unlock(&kexec_mutex);
-	}
-}
-
-size_t crash_get_memory_size(void)
-{
-	size_t size = 0;
-	mutex_lock(&kexec_mutex);
-	if (crashk_res.end != crashk_res.start)
-		size = resource_size(&crashk_res);
-	mutex_unlock(&kexec_mutex);
-	return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
-					   unsigned long end)
-{
-	unsigned long addr;
-
-	for (addr = begin; addr < end; addr += PAGE_SIZE)
-		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
-	int ret = 0;
-	unsigned long start, end;
-	unsigned long old_size;
-	struct resource *ram_res;
-
-	mutex_lock(&kexec_mutex);
-
-	if (kexec_crash_image) {
-		ret = -ENOENT;
-		goto unlock;
-	}
-	start = crashk_res.start;
-	end = crashk_res.end;
-	old_size = (end == 0) ? 0 : end - start + 1;
-	if (new_size >= old_size) {
-		ret = (new_size == old_size) ? 0 : -EINVAL;
-		goto unlock;
-	}
-
-	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
-	if (!ram_res) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
-
-	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
-	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
-	crash_map_reserved_pages();
-	crash_free_reserved_phys_range(end, crashk_res.end);
-
-	if ((start == end) && (crashk_res.parent != NULL))
-		release_resource(&crashk_res);
-
-	ram_res->start = end;
-	ram_res->end = crashk_res.end;
-	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
-	ram_res->name = "System RAM";
-
-	crashk_res.end = end - 1;
-
-	insert_resource(&iomem_resource, ram_res);
-	crash_unmap_reserved_pages();
-
-unlock:
-	mutex_unlock(&kexec_mutex);
-	return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-			    size_t data_len)
-{
-	struct elf_note note;
-
-	note.n_namesz = strlen(name) + 1;
-	note.n_descsz = data_len;
-	note.n_type   = type;
-	memcpy(buf, &note, sizeof(note));
-	buf += (sizeof(note) + 3)/4;
-	memcpy(buf, name, note.n_namesz);
-	buf += (note.n_namesz + 3)/4;
-	memcpy(buf, data, note.n_descsz);
-	buf += (note.n_descsz + 3)/4;
-
-	return buf;
-}
-
-static void final_note(u32 *buf)
-{
-	struct elf_note note;
-
-	note.n_namesz = 0;
-	note.n_descsz = 0;
-	note.n_type   = 0;
-	memcpy(buf, &note, sizeof(note));
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= nr_cpu_ids))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that, so there is no need to invent something new.
-	 */
-	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
-	if (!buf)
-		return;
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
-	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
-			      &prstatus, sizeof(prstatus));
-	final_note(buf);
-}
-
-static int __init crash_notes_memory_init(void)
-{
-	/* Allocate memory for saving cpu registers. */
-	crash_notes = alloc_percpu(note_buf_t);
-	if (!crash_notes) {
-		pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- *   crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
-					unsigned long long system_ram,
-					unsigned long long *crash_size,
-					unsigned long long *crash_base)
-{
-	char *cur = cmdline, *tmp;
-
-	/* for each entry of the comma-separated list */
-	do {
-		unsigned long long start, end = ULLONG_MAX, size;
-
-		/* get the start of the range */
-		start = memparse(cur, &tmp);
-		if (cur == tmp) {
-			pr_warn("crashkernel: Memory value expected\n");
-			return -EINVAL;
-		}
-		cur = tmp;
-		if (*cur != '-') {
-			pr_warn("crashkernel: '-' expected\n");
-			return -EINVAL;
-		}
-		cur++;
-
-		/* if no ':' is here, than we read the end */
-		if (*cur != ':') {
-			end = memparse(cur, &tmp);
-			if (cur == tmp) {
-				pr_warn("crashkernel: Memory value expected\n");
-				return -EINVAL;
-			}
-			cur = tmp;
-			if (end <= start) {
-				pr_warn("crashkernel: end <= start\n");
-				return -EINVAL;
-			}
-		}
-
-		if (*cur != ':') {
-			pr_warn("crashkernel: ':' expected\n");
-			return -EINVAL;
-		}
-		cur++;
-
-		size = memparse(cur, &tmp);
-		if (cur == tmp) {
-			pr_warn("Memory value expected\n");
-			return -EINVAL;
-		}
-		cur = tmp;
-		if (size >= system_ram) {
-			pr_warn("crashkernel: invalid size\n");
-			return -EINVAL;
-		}
-
-		/* match ? */
-		if (system_ram >= start && system_ram < end) {
-			*crash_size = size;
-			break;
-		}
-	} while (*cur++ == ',');
-
-	if (*crash_size > 0) {
-		while (*cur && *cur != ' ' && *cur != '@')
-			cur++;
-		if (*cur == '@') {
-			cur++;
-			*crash_base = memparse(cur, &tmp);
-			if (cur == tmp) {
-				pr_warn("Memory value expected after '@'\n");
-				return -EINVAL;
-			}
-		}
-	}
-
-	return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- *	crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
-					   unsigned long long *crash_size,
-					   unsigned long long *crash_base)
-{
-	char *cur = cmdline;
-
-	*crash_size = memparse(cmdline, &cur);
-	if (cmdline == cur) {
-		pr_warn("crashkernel: memory value expected\n");
-		return -EINVAL;
-	}
-
-	if (*cur == '@')
-		*crash_base = memparse(cur+1, &cur);
-	else if (*cur != ' ' && *cur != '\0') {
-		pr_warn("crashkernel: unrecognized char\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW  1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
-	[SUFFIX_HIGH] = ",high",
-	[SUFFIX_LOW]  = ",low",
-	[SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix"  crashkernel command lines like
- *
- *	crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
-					   unsigned long long	*crash_size,
-					   const char *suffix)
-{
-	char *cur = cmdline;
-
-	*crash_size = memparse(cmdline, &cur);
-	if (cmdline == cur) {
-		pr_warn("crashkernel: memory value expected\n");
-		return -EINVAL;
-	}
-
-	/* check with suffix */
-	if (strncmp(cur, suffix, strlen(suffix))) {
-		pr_warn("crashkernel: unrecognized char\n");
-		return -EINVAL;
-	}
-	cur += strlen(suffix);
-	if (*cur != ' ' && *cur != '\0') {
-		pr_warn("crashkernel: unrecognized char\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
-			     const char *name,
-			     const char *suffix)
-{
-	char *p = cmdline, *ck_cmdline = NULL;
-
-	/* find crashkernel and use the last one if there are more */
-	p = strstr(p, name);
-	while (p) {
-		char *end_p = strchr(p, ' ');
-		char *q;
-
-		if (!end_p)
-			end_p = p + strlen(p);
-
-		if (!suffix) {
-			int i;
-
-			/* skip the one with any known suffix */
-			for (i = 0; suffix_tbl[i]; i++) {
-				q = end_p - strlen(suffix_tbl[i]);
-				if (!strncmp(q, suffix_tbl[i],
-					     strlen(suffix_tbl[i])))
-					goto next;
-			}
-			ck_cmdline = p;
-		} else {
-			q = end_p - strlen(suffix);
-			if (!strncmp(q, suffix, strlen(suffix)))
-				ck_cmdline = p;
-		}
-next:
-		p = strstr(p+1, name);
-	}
-
-	if (!ck_cmdline)
-		return NULL;
-
-	return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base,
-			     const char *name,
-			     const char *suffix)
-{
-	char	*first_colon, *first_space;
-	char	*ck_cmdline;
-
-	BUG_ON(!crash_size || !crash_base);
-	*crash_size = 0;
-	*crash_base = 0;
-
-	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
-	if (!ck_cmdline)
-		return -EINVAL;
-
-	ck_cmdline += strlen(name);
-
-	if (suffix)
-		return parse_crashkernel_suffix(ck_cmdline, crash_size,
-				suffix);
-	/*
-	 * if the commandline contains a ':', then that's the extended
-	 * syntax -- if not, it must be the classic syntax
-	 */
-	first_colon = strchr(ck_cmdline, ':');
-	first_space = strchr(ck_cmdline, ' ');
-	if (first_colon && (!first_space || first_colon < first_space))
-		return parse_crashkernel_mem(ck_cmdline, system_ram,
-				crash_size, crash_base);
-
-	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-					"crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				"crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
-	u32 *buf = vmcoreinfo_note;
-
-	if (!vmcoreinfo_size)
-		return;
-	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
-			      vmcoreinfo_size);
-	final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
-	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
-	update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
-	va_list args;
-	char buf[0x50];
-	size_t r;
-
-	va_start(args, fmt);
-	r = vscnprintf(buf, sizeof(buf), fmt, args);
-	va_end(args);
-
-	r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
-	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
-	vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
-	return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
-	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
-	VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
-	VMCOREINFO_SYMBOL(init_uts_ns);
-	VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
-	VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
-	VMCOREINFO_SYMBOL(_stext);
-	VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-	VMCOREINFO_SYMBOL(mem_map);
-	VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
-	VMCOREINFO_SYMBOL(mem_section);
-	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
-	VMCOREINFO_STRUCT_SIZE(mem_section);
-	VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
-	VMCOREINFO_STRUCT_SIZE(page);
-	VMCOREINFO_STRUCT_SIZE(pglist_data);
-	VMCOREINFO_STRUCT_SIZE(zone);
-	VMCOREINFO_STRUCT_SIZE(free_area);
-	VMCOREINFO_STRUCT_SIZE(list_head);
-	VMCOREINFO_SIZE(nodemask_t);
-	VMCOREINFO_OFFSET(page, flags);
-	VMCOREINFO_OFFSET(page, _count);
-	VMCOREINFO_OFFSET(page, mapping);
-	VMCOREINFO_OFFSET(page, lru);
-	VMCOREINFO_OFFSET(page, _mapcount);
-	VMCOREINFO_OFFSET(page, private);
-	VMCOREINFO_OFFSET(pglist_data, node_zones);
-	VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
-	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
-	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
-	VMCOREINFO_OFFSET(pglist_data, node_id);
-	VMCOREINFO_OFFSET(zone, free_area);
-	VMCOREINFO_OFFSET(zone, vm_stat);
-	VMCOREINFO_OFFSET(zone, spanned_pages);
-	VMCOREINFO_OFFSET(free_area, free_list);
-	VMCOREINFO_OFFSET(list_head, next);
-	VMCOREINFO_OFFSET(list_head, prev);
-	VMCOREINFO_OFFSET(vmap_area, va_start);
-	VMCOREINFO_OFFSET(vmap_area, list);
-	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
-	log_buf_kexec_setup();
-	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
-	VMCOREINFO_NUMBER(NR_FREE_PAGES);
-	VMCOREINFO_NUMBER(PG_lru);
-	VMCOREINFO_NUMBER(PG_private);
-	VMCOREINFO_NUMBER(PG_swapcache);
-	VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
-	VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
-	VMCOREINFO_NUMBER(PG_head_mask);
-	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
-	VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
-	arch_crash_save_vmcoreinfo();
-	update_vmcoreinfo_note();
-
-	return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable.  If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
-	int error = 0;
-
-	if (!mutex_trylock(&kexec_mutex))
-		return -EBUSY;
-	if (!kexec_image) {
-		error = -EINVAL;
-		goto Unlock;
-	}
-
-#ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context) {
-		lock_system_sleep();
-		pm_prepare_console();
-		error = freeze_processes();
-		if (error) {
-			error = -EBUSY;
-			goto Restore_console;
-		}
-		suspend_console();
-		error = dpm_suspend_start(PMSG_FREEZE);
-		if (error)
-			goto Resume_console;
-		/* At this point, dpm_suspend_start() has been called,
-		 * but *not* dpm_suspend_end(). We *must* call
-		 * dpm_suspend_end() now.  Otherwise, drivers for
-		 * some devices (e.g. interrupt controllers) become
-		 * desynchronized with the actual state of the
-		 * hardware at resume time, and evil weirdness ensues.
-		 */
-		error = dpm_suspend_end(PMSG_FREEZE);
-		if (error)
-			goto Resume_devices;
-		error = disable_nonboot_cpus();
-		if (error)
-			goto Enable_cpus;
-		local_irq_disable();
-		error = syscore_suspend();
-		if (error)
-			goto Enable_irqs;
-	} else
-#endif
-	{
-		kexec_in_progress = true;
-		kernel_restart_prepare(NULL);
-		migrate_to_reboot_cpu();
-
-		/*
-		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
-		 * no further code needs to use CPU hotplug (which is true in
-		 * the reboot case). However, the kexec path depends on using
-		 * CPU hotplug again; so re-enable it here.
-		 */
-		cpu_hotplug_enable();
-		pr_emerg("Starting new kernel\n");
-		machine_shutdown();
-	}
-
-	machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context) {
-		syscore_resume();
- Enable_irqs:
-		local_irq_enable();
- Enable_cpus:
-		enable_nonboot_cpus();
-		dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
-		dpm_resume_end(PMSG_RESTORE);
- Resume_console:
-		resume_console();
-		thaw_processes();
- Restore_console:
-		pm_restore_console();
-		unlock_system_sleep();
-	}
-#endif
-
- Unlock:
-	mutex_unlock(&kexec_mutex);
-	return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..9aa25c034b2e
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1511 @@
+/*
+ * kexec.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt)	"kexec: " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+struct resource crashk_low_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+	/*
+	 * If crash_kexec_post_notifiers is enabled, don't run
+	 * crash_kexec() here yet, which must be run after panic
+	 * notifiers in panic().
+	 */
+	if (crash_kexec_post_notifiers)
+		return 0;
+	/*
+	 * There are 4 panic() calls in do_exit() path, each of which
+	 * corresponds to each of these 4 conditions.
+	 */
+	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+		return 1;
+	return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * the new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+				       gfp_t gfp_mask,
+				       unsigned long dest);
+
+int sanity_check_segment_list(struct kimage *image)
+{
+	int result, i;
+	unsigned long nr_segments = image->nr_segments;
+
+	/*
+	 * Verify we have good destination addresses.  The caller is
+	 * responsible for making certain we don't attempt to load
+	 * the new image into invalid or reserved areas of RAM.  This
+	 * just verifies it is an address we can use.
+	 *
+	 * Since the kernel does everything in page size chunks ensure
+	 * the destination addresses are page aligned.  Too many
+	 * special cases crop of when we don't do this.  The most
+	 * insidious is getting overlapping destination addresses
+	 * simply because addresses are changed to page size
+	 * granularity.
+	 */
+	result = -EADDRNOTAVAIL;
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+			return result;
+		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+			return result;
+	}
+
+	/* Verify our destination addresses do not overlap.
+	 * If we alloed overlapping destination addresses
+	 * through very weird things can happen with no
+	 * easy explanation as one segment stops on another.
+	 */
+	result = -EINVAL;
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+		unsigned long j;
+
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		for (j = 0; j < i; j++) {
+			unsigned long pstart, pend;
+
+			pstart = image->segment[j].mem;
+			pend   = pstart + image->segment[j].memsz;
+			/* Do the segments overlap ? */
+			if ((mend > pstart) && (mstart < pend))
+				return result;
+		}
+	}
+
+	/* Ensure our buffer sizes are strictly less than
+	 * our memory sizes.  This should always be the case,
+	 * and it is easier to check up front than to be surprised
+	 * later on.
+	 */
+	result = -EINVAL;
+	for (i = 0; i < nr_segments; i++) {
+		if (image->segment[i].bufsz > image->segment[i].memsz)
+			return result;
+	}
+
+	/*
+	 * Verify we have good destination addresses.  Normally
+	 * the caller is responsible for making certain we don't
+	 * attempt to load the new image into invalid or reserved
+	 * areas of RAM.  But crash kernels are preloaded into a
+	 * reserved area of ram.  We must ensure the addresses
+	 * are in the reserved area otherwise preloading the
+	 * kernel could corrupt things.
+	 */
+
+	if (image->type == KEXEC_TYPE_CRASH) {
+		result = -EADDRNOTAVAIL;
+		for (i = 0; i < nr_segments; i++) {
+			unsigned long mstart, mend;
+
+			mstart = image->segment[i].mem;
+			mend = mstart + image->segment[i].memsz - 1;
+			/* Ensure we are within the crash kernel limits */
+			if ((mstart < crashk_res.start) ||
+			    (mend > crashk_res.end))
+				return result;
+		}
+	}
+
+	return 0;
+}
+
+struct kimage *do_kimage_alloc_init(void)
+{
+	struct kimage *image;
+
+	/* Allocate a controlling structure */
+	image = kzalloc(sizeof(*image), GFP_KERNEL);
+	if (!image)
+		return NULL;
+
+	image->head = 0;
+	image->entry = &image->head;
+	image->last_entry = &image->head;
+	image->control_page = ~0; /* By default this does not apply */
+	image->type = KEXEC_TYPE_DEFAULT;
+
+	/* Initialize the list of control pages */
+	INIT_LIST_HEAD(&image->control_pages);
+
+	/* Initialize the list of destination pages */
+	INIT_LIST_HEAD(&image->dest_pages);
+
+	/* Initialize the list of unusable pages */
+	INIT_LIST_HEAD(&image->unusable_pages);
+
+	return image;
+}
+
+int kimage_is_destination_range(struct kimage *image,
+					unsigned long start,
+					unsigned long end)
+{
+	unsigned long i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		unsigned long mstart, mend;
+
+		mstart = image->segment[i].mem;
+		mend = mstart + image->segment[i].memsz;
+		if ((end > mstart) && (start < mend))
+			return 1;
+	}
+
+	return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+	struct page *pages;
+
+	pages = alloc_pages(gfp_mask, order);
+	if (pages) {
+		unsigned int count, i;
+
+		pages->mapping = NULL;
+		set_page_private(pages, order);
+		count = 1 << order;
+		for (i = 0; i < count; i++)
+			SetPageReserved(pages + i);
+	}
+
+	return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+	unsigned int order, count, i;
+
+	order = page_private(page);
+	count = 1 << order;
+	for (i = 0; i < count; i++)
+		ClearPageReserved(page + i);
+	__free_pages(page, order);
+}
+
+void kimage_free_page_list(struct list_head *list)
+{
+	struct list_head *pos, *next;
+
+	list_for_each_safe(pos, next, list) {
+		struct page *page;
+
+		page = list_entry(pos, struct page, lru);
+		list_del(&page->lru);
+		kimage_free_pages(page);
+	}
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+							unsigned int order)
+{
+	/* Control pages are special, they are the intermediaries
+	 * that are needed while we copy the rest of the pages
+	 * to their final resting place.  As such they must
+	 * not conflict with either the destination addresses
+	 * or memory the kernel is already using.
+	 *
+	 * The only case where we really need more than one of
+	 * these are for architectures where we cannot disable
+	 * the MMU and must instead generate an identity mapped
+	 * page table for all of the memory.
+	 *
+	 * At worst this runs in O(N) of the image size.
+	 */
+	struct list_head extra_pages;
+	struct page *pages;
+	unsigned int count;
+
+	count = 1 << order;
+	INIT_LIST_HEAD(&extra_pages);
+
+	/* Loop while I can allocate a page and the page allocated
+	 * is a destination page.
+	 */
+	do {
+		unsigned long pfn, epfn, addr, eaddr;
+
+		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+		if (!pages)
+			break;
+		pfn   = page_to_pfn(pages);
+		epfn  = pfn + count;
+		addr  = pfn << PAGE_SHIFT;
+		eaddr = epfn << PAGE_SHIFT;
+		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+			      kimage_is_destination_range(image, addr, eaddr)) {
+			list_add(&pages->lru, &extra_pages);
+			pages = NULL;
+		}
+	} while (!pages);
+
+	if (pages) {
+		/* Remember the allocated page... */
+		list_add(&pages->lru, &image->control_pages);
+
+		/* Because the page is already in it's destination
+		 * location we will never allocate another page at
+		 * that address.  Therefore kimage_alloc_pages
+		 * will not return it (again) and we don't need
+		 * to give it an entry in image->segment[].
+		 */
+	}
+	/* Deal with the destination pages I have inadvertently allocated.
+	 *
+	 * Ideally I would convert multi-page allocations into single
+	 * page allocations, and add everything to image->dest_pages.
+	 *
+	 * For now it is simpler to just free the pages.
+	 */
+	kimage_free_page_list(&extra_pages);
+
+	return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+						      unsigned int order)
+{
+	/* Control pages are special, they are the intermediaries
+	 * that are needed while we copy the rest of the pages
+	 * to their final resting place.  As such they must
+	 * not conflict with either the destination addresses
+	 * or memory the kernel is already using.
+	 *
+	 * Control pages are also the only pags we must allocate
+	 * when loading a crash kernel.  All of the other pages
+	 * are specified by the segments and we just memcpy
+	 * into them directly.
+	 *
+	 * The only case where we really need more than one of
+	 * these are for architectures where we cannot disable
+	 * the MMU and must instead generate an identity mapped
+	 * page table for all of the memory.
+	 *
+	 * Given the low demand this implements a very simple
+	 * allocator that finds the first hole of the appropriate
+	 * size in the reserved memory region, and allocates all
+	 * of the memory up to and including the hole.
+	 */
+	unsigned long hole_start, hole_end, size;
+	struct page *pages;
+
+	pages = NULL;
+	size = (1 << order) << PAGE_SHIFT;
+	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+	hole_end   = hole_start + size - 1;
+	while (hole_end <= crashk_res.end) {
+		unsigned long i;
+
+		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+			break;
+		/* See if I overlap any of the segments */
+		for (i = 0; i < image->nr_segments; i++) {
+			unsigned long mstart, mend;
+
+			mstart = image->segment[i].mem;
+			mend   = mstart + image->segment[i].memsz - 1;
+			if ((hole_end >= mstart) && (hole_start <= mend)) {
+				/* Advance the hole to the end of the segment */
+				hole_start = (mend + (size - 1)) & ~(size - 1);
+				hole_end   = hole_start + size - 1;
+				break;
+			}
+		}
+		/* If I don't overlap any segments I have found my hole! */
+		if (i == image->nr_segments) {
+			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+			break;
+		}
+	}
+	if (pages)
+		image->control_page = hole_end;
+
+	return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+					 unsigned int order)
+{
+	struct page *pages = NULL;
+
+	switch (image->type) {
+	case KEXEC_TYPE_DEFAULT:
+		pages = kimage_alloc_normal_control_pages(image, order);
+		break;
+	case KEXEC_TYPE_CRASH:
+		pages = kimage_alloc_crash_control_pages(image, order);
+		break;
+	}
+
+	return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+	if (*image->entry != 0)
+		image->entry++;
+
+	if (image->entry == image->last_entry) {
+		kimage_entry_t *ind_page;
+		struct page *page;
+
+		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+		if (!page)
+			return -ENOMEM;
+
+		ind_page = page_address(page);
+		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		image->entry = ind_page;
+		image->last_entry = ind_page +
+				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+	}
+	*image->entry = entry;
+	image->entry++;
+	*image->entry = 0;
+
+	return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+				   unsigned long destination)
+{
+	int result;
+
+	destination &= PAGE_MASK;
+	result = kimage_add_entry(image, destination | IND_DESTINATION);
+
+	return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+	int result;
+
+	page &= PAGE_MASK;
+	result = kimage_add_entry(image, page | IND_SOURCE);
+
+	return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+	/* Walk through and free any extra destination pages I may have */
+	kimage_free_page_list(&image->dest_pages);
+
+	/* Walk through and free any unusable pages I have cached */
+	kimage_free_page_list(&image->unusable_pages);
+
+}
+void kimage_terminate(struct kimage *image)
+{
+	if (*image->entry != 0)
+		image->entry++;
+
+	*image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+		ptr = (entry & IND_INDIRECTION) ? \
+			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+	struct page *page;
+
+	page = pfn_to_page(entry >> PAGE_SHIFT);
+	kimage_free_pages(page);
+}
+
+void kimage_free(struct kimage *image)
+{
+	kimage_entry_t *ptr, entry;
+	kimage_entry_t ind = 0;
+
+	if (!image)
+		return;
+
+	kimage_free_extra_pages(image);
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_INDIRECTION) {
+			/* Free the previous indirection page */
+			if (ind & IND_INDIRECTION)
+				kimage_free_entry(ind);
+			/* Save this indirection page until we are
+			 * done with it.
+			 */
+			ind = entry;
+		} else if (entry & IND_SOURCE)
+			kimage_free_entry(entry);
+	}
+	/* Free the final indirection page */
+	if (ind & IND_INDIRECTION)
+		kimage_free_entry(ind);
+
+	/* Handle any machine specific cleanup */
+	machine_kexec_cleanup(image);
+
+	/* Free the kexec control pages... */
+	kimage_free_page_list(&image->control_pages);
+
+	/*
+	 * Free up any temporary buffers allocated. This might hit if
+	 * error occurred much later after buffer allocation.
+	 */
+	if (image->file_mode)
+		kimage_file_post_load_cleanup(image);
+
+	kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+					unsigned long page)
+{
+	kimage_entry_t *ptr, entry;
+	unsigned long destination = 0;
+
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_DESTINATION)
+			destination = entry & PAGE_MASK;
+		else if (entry & IND_SOURCE) {
+			if (page == destination)
+				return ptr;
+			destination += PAGE_SIZE;
+		}
+	}
+
+	return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+					gfp_t gfp_mask,
+					unsigned long destination)
+{
+	/*
+	 * Here we implement safeguards to ensure that a source page
+	 * is not copied to its destination page before the data on
+	 * the destination page is no longer useful.
+	 *
+	 * To do this we maintain the invariant that a source page is
+	 * either its own destination page, or it is not a
+	 * destination page at all.
+	 *
+	 * That is slightly stronger than required, but the proof
+	 * that no problems will not occur is trivial, and the
+	 * implementation is simply to verify.
+	 *
+	 * When allocating all pages normally this algorithm will run
+	 * in O(N) time, but in the worst case it will run in O(N^2)
+	 * time.   If the runtime is a problem the data structures can
+	 * be fixed.
+	 */
+	struct page *page;
+	unsigned long addr;
+
+	/*
+	 * Walk through the list of destination pages, and see if I
+	 * have a match.
+	 */
+	list_for_each_entry(page, &image->dest_pages, lru) {
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+		if (addr == destination) {
+			list_del(&page->lru);
+			return page;
+		}
+	}
+	page = NULL;
+	while (1) {
+		kimage_entry_t *old;
+
+		/* Allocate a page, if we run out of memory give up */
+		page = kimage_alloc_pages(gfp_mask, 0);
+		if (!page)
+			return NULL;
+		/* If the page cannot be used file it away */
+		if (page_to_pfn(page) >
+				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+			list_add(&page->lru, &image->unusable_pages);
+			continue;
+		}
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+
+		/* If it is the destination page we want use it */
+		if (addr == destination)
+			break;
+
+		/* If the page is not a destination page use it */
+		if (!kimage_is_destination_range(image, addr,
+						  addr + PAGE_SIZE))
+			break;
+
+		/*
+		 * I know that the page is someones destination page.
+		 * See if there is already a source page for this
+		 * destination page.  And if so swap the source pages.
+		 */
+		old = kimage_dst_used(image, addr);
+		if (old) {
+			/* If so move it */
+			unsigned long old_addr;
+			struct page *old_page;
+
+			old_addr = *old & PAGE_MASK;
+			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			copy_highpage(page, old_page);
+			*old = addr | (*old & ~PAGE_MASK);
+
+			/* The old page I have found cannot be a
+			 * destination page, so return it if it's
+			 * gfp_flags honor the ones passed in.
+			 */
+			if (!(gfp_mask & __GFP_HIGHMEM) &&
+			    PageHighMem(old_page)) {
+				kimage_free_pages(old_page);
+				continue;
+			}
+			addr = old_addr;
+			page = old_page;
+			break;
+		}
+		/* Place the page on the destination list, to be used later */
+		list_add(&page->lru, &image->dest_pages);
+	}
+
+	return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+					 struct kexec_segment *segment)
+{
+	unsigned long maddr;
+	size_t ubytes, mbytes;
+	int result;
+	unsigned char __user *buf = NULL;
+	unsigned char *kbuf = NULL;
+
+	result = 0;
+	if (image->file_mode)
+		kbuf = segment->kbuf;
+	else
+		buf = segment->buf;
+	ubytes = segment->bufsz;
+	mbytes = segment->memsz;
+	maddr = segment->mem;
+
+	result = kimage_set_destination(image, maddr);
+	if (result < 0)
+		goto out;
+
+	while (mbytes) {
+		struct page *page;
+		char *ptr;
+		size_t uchunk, mchunk;
+
+		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+		if (!page) {
+			result  = -ENOMEM;
+			goto out;
+		}
+		result = kimage_add_page(image, page_to_pfn(page)
+								<< PAGE_SHIFT);
+		if (result < 0)
+			goto out;
+
+		ptr = kmap(page);
+		/* Start with a clear page */
+		clear_page(ptr);
+		ptr += maddr & ~PAGE_MASK;
+		mchunk = min_t(size_t, mbytes,
+				PAGE_SIZE - (maddr & ~PAGE_MASK));
+		uchunk = min(ubytes, mchunk);
+
+		/* For file based kexec, source pages are in kernel memory */
+		if (image->file_mode)
+			memcpy(ptr, kbuf, uchunk);
+		else
+			result = copy_from_user(ptr, buf, uchunk);
+		kunmap(page);
+		if (result) {
+			result = -EFAULT;
+			goto out;
+		}
+		ubytes -= uchunk;
+		maddr  += mchunk;
+		if (image->file_mode)
+			kbuf += mchunk;
+		else
+			buf += mchunk;
+		mbytes -= mchunk;
+	}
+out:
+	return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+					struct kexec_segment *segment)
+{
+	/* For crash dumps kernels we simply copy the data from
+	 * user space to it's destination.
+	 * We do things a page at a time for the sake of kmap.
+	 */
+	unsigned long maddr;
+	size_t ubytes, mbytes;
+	int result;
+	unsigned char __user *buf = NULL;
+	unsigned char *kbuf = NULL;
+
+	result = 0;
+	if (image->file_mode)
+		kbuf = segment->kbuf;
+	else
+		buf = segment->buf;
+	ubytes = segment->bufsz;
+	mbytes = segment->memsz;
+	maddr = segment->mem;
+	while (mbytes) {
+		struct page *page;
+		char *ptr;
+		size_t uchunk, mchunk;
+
+		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		if (!page) {
+			result  = -ENOMEM;
+			goto out;
+		}
+		ptr = kmap(page);
+		ptr += maddr & ~PAGE_MASK;
+		mchunk = min_t(size_t, mbytes,
+				PAGE_SIZE - (maddr & ~PAGE_MASK));
+		uchunk = min(ubytes, mchunk);
+		if (mchunk > uchunk) {
+			/* Zero the trailing part of the page */
+			memset(ptr + uchunk, 0, mchunk - uchunk);
+		}
+
+		/* For file based kexec, source pages are in kernel memory */
+		if (image->file_mode)
+			memcpy(ptr, kbuf, uchunk);
+		else
+			result = copy_from_user(ptr, buf, uchunk);
+		kexec_flush_icache_page(page);
+		kunmap(page);
+		if (result) {
+			result = -EFAULT;
+			goto out;
+		}
+		ubytes -= uchunk;
+		maddr  += mchunk;
+		if (image->file_mode)
+			kbuf += mchunk;
+		else
+			buf += mchunk;
+		mbytes -= mchunk;
+	}
+out:
+	return result;
+}
+
+int kimage_load_segment(struct kimage *image,
+				struct kexec_segment *segment)
+{
+	int result = -ENOMEM;
+
+	switch (image->type) {
+	case KEXEC_TYPE_DEFAULT:
+		result = kimage_load_normal_segment(image, segment);
+		break;
+	case KEXEC_TYPE_CRASH:
+		result = kimage_load_crash_segment(image, segment);
+		break;
+	}
+
+	return result;
+}
+
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+int kexec_load_disabled;
+
+void crash_kexec(struct pt_regs *regs)
+{
+	/* Take the kexec_mutex here to prevent sys_kexec_load
+	 * running on one cpu from replacing the crash kernel
+	 * we are using after a panic on a different cpu.
+	 *
+	 * If the crash kernel was not located in a fixed area
+	 * of memory the xchg(&kexec_crash_image) would be
+	 * sufficient.  But since I reuse the memory...
+	 */
+	if (mutex_trylock(&kexec_mutex)) {
+		if (kexec_crash_image) {
+			struct pt_regs fixed_regs;
+
+			crash_setup_regs(&fixed_regs, regs);
+			crash_save_vmcoreinfo();
+			machine_crash_shutdown(&fixed_regs);
+			machine_kexec(kexec_crash_image);
+		}
+		mutex_unlock(&kexec_mutex);
+	}
+}
+
+size_t crash_get_memory_size(void)
+{
+	size_t size = 0;
+
+	mutex_lock(&kexec_mutex);
+	if (crashk_res.end != crashk_res.start)
+		size = resource_size(&crashk_res);
+	mutex_unlock(&kexec_mutex);
+	return size;
+}
+
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+					   unsigned long end)
+{
+	unsigned long addr;
+
+	for (addr = begin; addr < end; addr += PAGE_SIZE)
+		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+	int ret = 0;
+	unsigned long start, end;
+	unsigned long old_size;
+	struct resource *ram_res;
+
+	mutex_lock(&kexec_mutex);
+
+	if (kexec_crash_image) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+	start = crashk_res.start;
+	end = crashk_res.end;
+	old_size = (end == 0) ? 0 : end - start + 1;
+	if (new_size >= old_size) {
+		ret = (new_size == old_size) ? 0 : -EINVAL;
+		goto unlock;
+	}
+
+	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+	if (!ram_res) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+	crash_map_reserved_pages();
+	crash_free_reserved_phys_range(end, crashk_res.end);
+
+	if ((start == end) && (crashk_res.parent != NULL))
+		release_resource(&crashk_res);
+
+	ram_res->start = end;
+	ram_res->end = crashk_res.end;
+	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+	ram_res->name = "System RAM";
+
+	crashk_res.end = end - 1;
+
+	insert_resource(&iomem_resource, ram_res);
+	crash_unmap_reserved_pages();
+
+unlock:
+	mutex_unlock(&kexec_mutex);
+	return ret;
+}
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= nr_cpu_ids))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+			      &prstatus, sizeof(prstatus));
+	final_note(buf);
+}
+
+static int __init crash_notes_memory_init(void)
+{
+	/* Allocate memory for saving cpu registers. */
+	crash_notes = alloc_percpu(note_buf_t);
+	if (!crash_notes) {
+		pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ *   crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+					unsigned long long system_ram,
+					unsigned long long *crash_size,
+					unsigned long long *crash_base)
+{
+	char *cur = cmdline, *tmp;
+
+	/* for each entry of the comma-separated list */
+	do {
+		unsigned long long start, end = ULLONG_MAX, size;
+
+		/* get the start of the range */
+		start = memparse(cur, &tmp);
+		if (cur == tmp) {
+			pr_warn("crashkernel: Memory value expected\n");
+			return -EINVAL;
+		}
+		cur = tmp;
+		if (*cur != '-') {
+			pr_warn("crashkernel: '-' expected\n");
+			return -EINVAL;
+		}
+		cur++;
+
+		/* if no ':' is here, than we read the end */
+		if (*cur != ':') {
+			end = memparse(cur, &tmp);
+			if (cur == tmp) {
+				pr_warn("crashkernel: Memory value expected\n");
+				return -EINVAL;
+			}
+			cur = tmp;
+			if (end <= start) {
+				pr_warn("crashkernel: end <= start\n");
+				return -EINVAL;
+			}
+		}
+
+		if (*cur != ':') {
+			pr_warn("crashkernel: ':' expected\n");
+			return -EINVAL;
+		}
+		cur++;
+
+		size = memparse(cur, &tmp);
+		if (cur == tmp) {
+			pr_warn("Memory value expected\n");
+			return -EINVAL;
+		}
+		cur = tmp;
+		if (size >= system_ram) {
+			pr_warn("crashkernel: invalid size\n");
+			return -EINVAL;
+		}
+
+		/* match ? */
+		if (system_ram >= start && system_ram < end) {
+			*crash_size = size;
+			break;
+		}
+	} while (*cur++ == ',');
+
+	if (*crash_size > 0) {
+		while (*cur && *cur != ' ' && *cur != '@')
+			cur++;
+		if (*cur == '@') {
+			cur++;
+			*crash_base = memparse(cur, &tmp);
+			if (cur == tmp) {
+				pr_warn("Memory value expected after '@'\n");
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ *	crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+					   unsigned long long *crash_size,
+					   unsigned long long *crash_base)
+{
+	char *cur = cmdline;
+
+	*crash_size = memparse(cmdline, &cur);
+	if (cmdline == cur) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	if (*cur == '@')
+		*crash_base = memparse(cur+1, &cur);
+	else if (*cur != ' ' && *cur != '\0') {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW  1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+	[SUFFIX_HIGH] = ",high",
+	[SUFFIX_LOW]  = ",low",
+	[SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix"  crashkernel command lines like
+ *
+ *	crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+					   unsigned long long	*crash_size,
+					   const char *suffix)
+{
+	char *cur = cmdline;
+
+	*crash_size = memparse(cmdline, &cur);
+	if (cmdline == cur) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* check with suffix */
+	if (strncmp(cur, suffix, strlen(suffix))) {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+	cur += strlen(suffix);
+	if (*cur != ' ' && *cur != '\0') {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+			     const char *name,
+			     const char *suffix)
+{
+	char *p = cmdline, *ck_cmdline = NULL;
+
+	/* find crashkernel and use the last one if there are more */
+	p = strstr(p, name);
+	while (p) {
+		char *end_p = strchr(p, ' ');
+		char *q;
+
+		if (!end_p)
+			end_p = p + strlen(p);
+
+		if (!suffix) {
+			int i;
+
+			/* skip the one with any known suffix */
+			for (i = 0; suffix_tbl[i]; i++) {
+				q = end_p - strlen(suffix_tbl[i]);
+				if (!strncmp(q, suffix_tbl[i],
+					     strlen(suffix_tbl[i])))
+					goto next;
+			}
+			ck_cmdline = p;
+		} else {
+			q = end_p - strlen(suffix);
+			if (!strncmp(q, suffix, strlen(suffix)))
+				ck_cmdline = p;
+		}
+next:
+		p = strstr(p+1, name);
+	}
+
+	if (!ck_cmdline)
+		return NULL;
+
+	return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base,
+			     const char *name,
+			     const char *suffix)
+{
+	char	*first_colon, *first_space;
+	char	*ck_cmdline;
+
+	BUG_ON(!crash_size || !crash_base);
+	*crash_size = 0;
+	*crash_base = 0;
+
+	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+	if (!ck_cmdline)
+		return -EINVAL;
+
+	ck_cmdline += strlen(name);
+
+	if (suffix)
+		return parse_crashkernel_suffix(ck_cmdline, crash_size,
+				suffix);
+	/*
+	 * if the commandline contains a ':', then that's the extended
+	 * syntax -- if not, it must be the classic syntax
+	 */
+	first_colon = strchr(ck_cmdline, ':');
+	first_space = strchr(ck_cmdline, ' ');
+	if (first_colon && (!first_space || first_colon < first_space))
+		return parse_crashkernel_mem(ck_cmdline, system_ram,
+				crash_size, crash_base);
+
+	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+					"crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+				"crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+static void update_vmcoreinfo_note(void)
+{
+	u32 *buf = vmcoreinfo_note;
+
+	if (!vmcoreinfo_size)
+		return;
+	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+			      vmcoreinfo_size);
+	final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+	update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+	va_list args;
+	char buf[0x50];
+	size_t r;
+
+	va_start(args, fmt);
+	r = vscnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+
+	r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+	vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+	return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+	VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+	VMCOREINFO_SYMBOL(init_uts_ns);
+	VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+	VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+	VMCOREINFO_SYMBOL(_stext);
+	VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+	VMCOREINFO_SYMBOL(mem_map);
+	VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+	VMCOREINFO_SYMBOL(mem_section);
+	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+	VMCOREINFO_STRUCT_SIZE(mem_section);
+	VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+	VMCOREINFO_STRUCT_SIZE(page);
+	VMCOREINFO_STRUCT_SIZE(pglist_data);
+	VMCOREINFO_STRUCT_SIZE(zone);
+	VMCOREINFO_STRUCT_SIZE(free_area);
+	VMCOREINFO_STRUCT_SIZE(list_head);
+	VMCOREINFO_SIZE(nodemask_t);
+	VMCOREINFO_OFFSET(page, flags);
+	VMCOREINFO_OFFSET(page, _count);
+	VMCOREINFO_OFFSET(page, mapping);
+	VMCOREINFO_OFFSET(page, lru);
+	VMCOREINFO_OFFSET(page, _mapcount);
+	VMCOREINFO_OFFSET(page, private);
+	VMCOREINFO_OFFSET(pglist_data, node_zones);
+	VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+	VMCOREINFO_OFFSET(pglist_data, node_id);
+	VMCOREINFO_OFFSET(zone, free_area);
+	VMCOREINFO_OFFSET(zone, vm_stat);
+	VMCOREINFO_OFFSET(zone, spanned_pages);
+	VMCOREINFO_OFFSET(free_area, free_list);
+	VMCOREINFO_OFFSET(list_head, next);
+	VMCOREINFO_OFFSET(list_head, prev);
+	VMCOREINFO_OFFSET(vmap_area, va_start);
+	VMCOREINFO_OFFSET(vmap_area, list);
+	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+	log_buf_kexec_setup();
+	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+	VMCOREINFO_NUMBER(NR_FREE_PAGES);
+	VMCOREINFO_NUMBER(PG_lru);
+	VMCOREINFO_NUMBER(PG_private);
+	VMCOREINFO_NUMBER(PG_swapcache);
+	VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+	VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+	VMCOREINFO_NUMBER(PG_head_mask);
+	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLBFS
+	VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+	arch_crash_save_vmcoreinfo();
+	update_vmcoreinfo_note();
+
+	return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable.  If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+	int error = 0;
+
+	if (!mutex_trylock(&kexec_mutex))
+		return -EBUSY;
+	if (!kexec_image) {
+		error = -EINVAL;
+		goto Unlock;
+	}
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context) {
+		lock_system_sleep();
+		pm_prepare_console();
+		error = freeze_processes();
+		if (error) {
+			error = -EBUSY;
+			goto Restore_console;
+		}
+		suspend_console();
+		error = dpm_suspend_start(PMSG_FREEZE);
+		if (error)
+			goto Resume_console;
+		/* At this point, dpm_suspend_start() has been called,
+		 * but *not* dpm_suspend_end(). We *must* call
+		 * dpm_suspend_end() now.  Otherwise, drivers for
+		 * some devices (e.g. interrupt controllers) become
+		 * desynchronized with the actual state of the
+		 * hardware at resume time, and evil weirdness ensues.
+		 */
+		error = dpm_suspend_end(PMSG_FREEZE);
+		if (error)
+			goto Resume_devices;
+		error = disable_nonboot_cpus();
+		if (error)
+			goto Enable_cpus;
+		local_irq_disable();
+		error = syscore_suspend();
+		if (error)
+			goto Enable_irqs;
+	} else
+#endif
+	{
+		kexec_in_progress = true;
+		kernel_restart_prepare(NULL);
+		migrate_to_reboot_cpu();
+
+		/*
+		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+		 * no further code needs to use CPU hotplug (which is true in
+		 * the reboot case). However, the kexec path depends on using
+		 * CPU hotplug again; so re-enable it here.
+		 */
+		cpu_hotplug_enable();
+		pr_emerg("Starting new kernel\n");
+		machine_shutdown();
+	}
+
+	machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context) {
+		syscore_resume();
+ Enable_irqs:
+		local_irq_enable();
+ Enable_cpus:
+		enable_nonboot_cpus();
+		dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+		dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+		resume_console();
+		thaw_processes();
+ Restore_console:
+		pm_restore_console();
+		unlock_system_sleep();
+	}
+#endif
+
+ Unlock:
+	mutex_unlock(&kexec_mutex);
+	return error;
+}
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..e83b26464061 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
 KERNEL_ATTR_RW(profiling);
 #endif
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static ssize_t kexec_loaded_show(struct kobject *kobj,
 				 struct kobj_attribute *attr, char *buf)
 {
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 /* whether file capabilities are enabled */
 static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_PROFILING
 	&profiling_attr.attr,
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	&kexec_loaded_attr.attr,
 	&kexec_crash_loaded_attr.attr,
 	&kexec_crash_size_attr.attr,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c24203368..8f0324ef72ab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
 	.release = devkmsg_release,
 };
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This appends the listed symbols to /proc/vmcore
  *
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..bd30a973fe94 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		kernel_restart(buffer);
 		break;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	case LINUX_REBOOT_CMD_KEXEC:
 		ret = kernel_kexec();
 		break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 19b62b522158..715cc57cc66a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	{
 		.procname	= "kexec_load_disabled",
 		.data		= &kexec_load_disabled,
-- 
cgit v1.2.3-70-g09d2


From 6894258eda2f9badc28c878086c0e54bd5b7fb30 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 9 Sep 2015 15:39:39 -0700
Subject: dma-mapping: consolidate dma_{alloc,free}_{attrs,coherent}

Since 2009 we have a nice asm-generic header implementing lots of DMA API
functions for architectures using struct dma_map_ops, but unfortunately
it's still missing a lot of APIs that all architectures still have to
duplicate.

This series consolidates the remaining functions, although we still need
arch opt outs for two of them as a few architectures have very
non-standard implementations.

This patch (of 5):

The coherent DMA allocator works the same over all architectures supporting
dma_map operations.

This patch consolidates them and converges the minor differences:

 - the debug_dma helpers are now called from all architectures, including
   those that were previously missing them
 - dma_alloc_from_coherent and dma_release_from_coherent are now always
   called from the generic alloc/free routines instead of the ops
   dma-mapping-common.h always includes dma-coherent.h to get the defintions
   for them, or the stubs if the architecture doesn't support this feature
 - checks for ->alloc / ->free presence are removed.  There is only one
   magic instead of dma_map_ops without them (mic_dma_ops) and that one
   is x86 only anyway.

Besides that only x86 needs special treatment to replace a default devices
if none is passed and tweak the gfp_flags.  An optional arch hook is provided
for that.

[linux@roeck-us.net: fix build]
[jcmvbkbc@gmail.com: fix xtensa]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/dma-mapping.h      | 18 ----------
 arch/arm/include/asm/dma-mapping.h        | 29 ----------------
 arch/arm/mm/dma-mapping.c                 | 12 -------
 arch/arm64/include/asm/dma-mapping.h      | 33 ------------------
 arch/h8300/include/asm/dma-mapping.h      | 26 --------------
 arch/hexagon/include/asm/dma-mapping.h    | 33 ------------------
 arch/ia64/include/asm/dma-mapping.h       | 25 -------------
 arch/microblaze/include/asm/dma-mapping.h | 31 -----------------
 arch/mips/cavium-octeon/dma-octeon.c      |  8 -----
 arch/mips/include/asm/dma-mapping.h       | 31 -----------------
 arch/mips/loongson64/common/dma-swiotlb.c |  8 -----
 arch/mips/mm/dma-default.c                |  7 ----
 arch/mips/netlogic/common/nlm-dma.c       | 10 ------
 arch/openrisc/include/asm/dma-mapping.h   | 30 ----------------
 arch/powerpc/include/asm/dma-mapping.h    | 33 ------------------
 arch/s390/include/asm/dma-mapping.h       | 31 -----------------
 arch/sh/include/asm/dma-mapping.h         | 37 --------------------
 arch/sparc/include/asm/dma-mapping.h      | 26 --------------
 arch/tile/include/asm/dma-mapping.h       | 27 --------------
 arch/unicore32/include/asm/dma-mapping.h  | 24 -------------
 arch/x86/include/asm/dma-mapping.h        | 16 ++-------
 arch/x86/kernel/pci-dma.c                 | 49 +++++---------------------
 arch/xtensa/include/asm/dma-mapping.h     | 31 -----------------
 drivers/xen/swiotlb-xen.c                 |  6 ----
 include/asm-generic/dma-mapping-common.h  | 58 +++++++++++++++++++++++++++++++
 25 files changed, 70 insertions(+), 569 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index dfa32f061320..9fef5bd59a82 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -12,24 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t gfp,
-				    struct dma_attrs *attrs)
-{
-	return get_dma_ops(dev)->alloc(dev, size, dma_handle, gfp, attrs);
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *vaddr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	get_dma_ops(dev)->free(dev, size, vaddr, dma_handle, attrs);
-}
-
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return get_dma_ops(dev)->mapping_error(dev, dma_addr);
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index a68b9d8a71fe..bc404473f1ca 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -8,7 +8,6 @@
 #include <linux/dma-attrs.h>
 #include <linux/dma-debug.h>
 
-#include <asm-generic/dma-coherent.h>
 #include <asm/memory.h>
 
 #include <xen/xen.h>
@@ -209,21 +208,6 @@ extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
 extern void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 			   gfp_t gfp, struct dma_attrs *attrs);
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				       dma_addr_t *dma_handle, gfp_t flag,
-				       struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *cpu_addr;
-	BUG_ON(!ops);
-
-	cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-	return cpu_addr;
-}
-
 /**
  * arm_dma_free - free memory allocated by arm_dma_alloc
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -241,19 +225,6 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 extern void arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
 			 dma_addr_t handle, struct dma_attrs *attrs);
 
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				     void *cpu_addr, dma_addr_t dma_handle,
-				     struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	BUG_ON(!ops);
-
-	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-	ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 /**
  * arm_dma_mmap - map a coherent DMA allocation into user space
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index bf35abcc7d59..e62604384945 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -676,10 +676,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 		    gfp_t gfp, struct dma_attrs *attrs)
 {
 	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
-	void *memory;
-
-	if (dma_alloc_from_coherent(dev, size, handle, &memory))
-		return memory;
 
 	return __dma_alloc(dev, size, handle, gfp, prot, false,
 			   attrs, __builtin_return_address(0));
@@ -688,11 +684,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
 	dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-	void *memory;
-
-	if (dma_alloc_from_coherent(dev, size, handle, &memory))
-		return memory;
-
 	return __dma_alloc(dev, size, handle, gfp, PAGE_KERNEL, true,
 			   attrs, __builtin_return_address(0));
 }
@@ -752,9 +743,6 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
 	struct page *page = pfn_to_page(dma_to_pfn(dev, handle));
 	bool want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs);
 
-	if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
-		return;
-
 	size = PAGE_ALIGN(size);
 
 	if (nommu()) {
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index f0d6d0bfe55c..5e11b3f0fe3a 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -22,8 +22,6 @@
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 
-#include <asm-generic/dma-coherent.h>
-
 #include <xen/xen.h>
 #include <asm/xen/hypervisor.h>
 
@@ -120,37 +118,6 @@ static inline void dma_mark_clean(void *addr, size_t size)
 {
 }
 
-#define dma_alloc_coherent(d, s, h, f)	dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, h, f)	dma_free_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t flags,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *vaddr;
-
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &vaddr))
-		return vaddr;
-
-	vaddr = ops->alloc(dev, size, dma_handle, flags, attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, vaddr);
-	return vaddr;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *vaddr, dma_addr_t dev_addr,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (dma_release_from_coherent(dev, get_order(size), vaddr))
-		return;
-
-	debug_dma_free_coherent(dev, size, vaddr, dev_addr);
-	ops->free(dev, size, vaddr, dev_addr, attrs);
-}
-
 /*
  * There is no dma_cache_sync() implementation, so just return NULL here.
  */
diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h
index 6e67a90902f2..826aa9b519b7 100644
--- a/arch/h8300/include/asm/dma-mapping.h
+++ b/arch/h8300/include/asm/dma-mapping.h
@@ -1,8 +1,6 @@
 #ifndef _H8300_DMA_MAPPING_H
 #define _H8300_DMA_MAPPING_H
 
-#include <asm-generic/dma-coherent.h>
-
 extern struct dma_map_ops h8300_dma_map_ops;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
@@ -25,30 +23,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t flag,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *memory;
-
-	memory = ops->alloc(dev, size, dma_handle, flag, attrs);
-	return memory;
-}
-
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *cpu_addr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h
index 16965427f6b4..c20d3caa7dad 100644
--- a/arch/hexagon/include/asm/dma-mapping.h
+++ b/arch/hexagon/include/asm/dma-mapping.h
@@ -70,37 +70,4 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return (dma_addr == bad_dma_address);
 }
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t flag,
-				    struct dma_attrs *attrs)
-{
-	void *ret;
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!dma_ops);
-
-	ret = ops->alloc(dev, size, dma_handle, flag, attrs);
-
-	debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-	return ret;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *cpu_addr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-	BUG_ON(!dma_ops);
-
-	dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-
-	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-}
-
 #endif
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index cf3ab7e784b5..d36f83cc226a 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -23,31 +23,6 @@ extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t,
 extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int,
 				enum dma_data_direction);
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *daddr, gfp_t gfp,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	void *caddr;
-
-	caddr = ops->alloc(dev, size, daddr, gfp, attrs);
-	debug_dma_alloc_coherent(dev, size, *daddr, caddr);
-	return caddr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *caddr, dma_addr_t daddr,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	debug_dma_free_coherent(dev, size, caddr, daddr);
-	ops->free(dev, size, caddr, daddr, attrs);
-}
-
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h
index ab353723076a..801dbe215a8c 100644
--- a/arch/microblaze/include/asm/dma-mapping.h
+++ b/arch/microblaze/include/asm/dma-mapping.h
@@ -27,7 +27,6 @@
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
 #include <asm/io.h>
-#include <asm-generic/dma-coherent.h>
 #include <asm/cacheflush.h>
 
 #define DMA_ERROR_CODE		(~(dma_addr_t)0x0)
@@ -102,36 +101,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t flag,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *memory;
-
-	BUG_ON(!ops);
-
-	memory = ops->alloc(dev, size, dma_handle, flag, attrs);
-
-	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-	return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *cpu_addr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!ops);
-	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-	ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		enum dma_data_direction direction)
 {
diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c
index d8960d46417b..2cd45f5f9481 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -161,9 +161,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
 {
 	void *ret;
 
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-		return ret;
-
 	/* ignore region specifiers */
 	gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -194,11 +191,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
 static void octeon_dma_free_coherent(struct device *dev, size_t size,
 	void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-	int order = get_order(size);
-
-	if (dma_release_from_coherent(dev, order, vaddr))
-		return;
-
 	swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 360b3387182a..b197595134ba 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -4,7 +4,6 @@
 #include <linux/scatterlist.h>
 #include <asm/dma-coherence.h>
 #include <asm/cache.h>
-#include <asm-generic/dma-coherent.h>
 
 #ifndef CONFIG_SGI_IP27 /* Kludge to fix 2.6.39 build for IP27 */
 #include <dma-coherence.h>
@@ -65,36 +64,6 @@ dma_set_mask(struct device *dev, u64 mask)
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction);
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t gfp,
-				    struct dma_attrs *attrs)
-{
-	void *ret;
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
-
-	debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-	return ret;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *vaddr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	ops->free(dev, size, vaddr, dma_handle, attrs);
-
-	debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-}
-
-
 void *dma_alloc_noncoherent(struct device *dev, size_t size,
 			   dma_addr_t *dma_handle, gfp_t flag);
 
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
index 2c6b989c1bc4..ef9da3b5c543 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
@@ -14,9 +14,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
 {
 	void *ret;
 
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-		return ret;
-
 	/* ignore region specifiers */
 	gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -46,11 +43,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
 static void loongson_dma_free_coherent(struct device *dev, size_t size,
 		void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-	int order = get_order(size);
-
-	if (dma_release_from_coherent(dev, order, vaddr))
-		return;
-
 	swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index 8f23cf08f4ba..6c0fd13fa8e8 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -137,9 +137,6 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
 	struct page *page = NULL;
 	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-		return ret;
-
 	gfp = massage_gfp_flags(dev, gfp);
 
 	if (IS_ENABLED(CONFIG_DMA_CMA) && !(gfp & GFP_ATOMIC))
@@ -176,13 +173,9 @@ static void mips_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
 	dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
 	unsigned long addr = (unsigned long) vaddr;
-	int order = get_order(size);
 	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	struct page *page = NULL;
 
-	if (dma_release_from_coherent(dev, order, vaddr))
-		return;
-
 	plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL);
 
 	if (!plat_device_is_coherent(dev) && !hw_coherentio)
diff --git a/arch/mips/netlogic/common/nlm-dma.c b/arch/mips/netlogic/common/nlm-dma.c
index f3d4ae87abc7..3758715d4ab6 100644
--- a/arch/mips/netlogic/common/nlm-dma.c
+++ b/arch/mips/netlogic/common/nlm-dma.c
@@ -47,11 +47,6 @@ static char *nlm_swiotlb;
 static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
 	dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-	void *ret;
-
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-		return ret;
-
 	/* ignore region specifiers */
 	gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -69,11 +64,6 @@ static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
 static void nlm_dma_free_coherent(struct device *dev, size_t size,
 	void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-	int order = get_order(size);
-
-	if (dma_release_from_coherent(dev, order, vaddr))
-		return;
-
 	swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h
index fab8628e1b6e..a81d6f68e9c8 100644
--- a/arch/openrisc/include/asm/dma-mapping.h
+++ b/arch/openrisc/include/asm/dma-mapping.h
@@ -23,7 +23,6 @@
  */
 
 #include <linux/dma-debug.h>
-#include <asm-generic/dma-coherent.h>
 #include <linux/kmemcheck.h>
 #include <linux/dma-mapping.h>
 
@@ -38,35 +37,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) 
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t gfp,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *memory;
-
-	memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
-
-	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-	return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *cpu_addr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-	ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
 					  dma_addr_t *dma_handle, gfp_t gfp)
 {
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 710f60e380e0..e6ca63ac4c6c 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -137,39 +137,6 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask);
 extern int __dma_set_mask(struct device *dev, u64 dma_mask);
 extern u64 __dma_get_required_mask(struct device *dev);
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t flag,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
-	void *cpu_addr;
-
-	BUG_ON(!dma_ops);
-
-	cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-
-	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-	return cpu_addr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *cpu_addr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-	BUG_ON(!dma_ops);
-
-	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-	dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	struct dma_map_ops *dma_ops = get_dma_ops(dev);
diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h
index 9d395961e713..c29c9c7d81e8 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -56,35 +56,4 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return dma_addr == DMA_ERROR_CODE;
 }
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t flags,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *cpu_addr;
-
-	BUG_ON(!ops);
-
-	cpu_addr = ops->alloc(dev, size, dma_handle, flags, attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-	return cpu_addr;
-}
-
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *cpu_addr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!ops);
-
-	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-	ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 #endif /* _ASM_S390_DMA_MAPPING_H */
diff --git a/arch/sh/include/asm/dma-mapping.h b/arch/sh/include/asm/dma-mapping.h
index b437f2c780b8..3c78059e66ff 100644
--- a/arch/sh/include/asm/dma-mapping.h
+++ b/arch/sh/include/asm/dma-mapping.h
@@ -9,7 +9,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 	return dma_ops;
 }
 
-#include <asm-generic/dma-coherent.h>
 #include <asm-generic/dma-mapping-common.h>
 
 static inline int dma_supported(struct device *dev, u64 mask)
@@ -53,42 +52,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return dma_addr == 0;
 }
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t gfp,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *memory;
-
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-		return memory;
-	if (!ops->alloc)
-		return NULL;
-
-	memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-	return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *vaddr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (dma_release_from_coherent(dev, get_order(size), vaddr))
-		return;
-
-	debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-	if (ops->free)
-		ops->free(dev, size, vaddr, dma_handle, attrs);
-}
-
 /* arch/sh/mm/consistent.c */
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 					dma_addr_t *dma_addr, gfp_t flag,
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 7e064c68c5ec..a8c678494ce7 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -41,32 +41,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t flag,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *cpu_addr;
-
-	cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-	return cpu_addr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *cpu_addr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-	ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	debug_dma_mapping_error(dev, dma_addr);
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 1eae359d8315..4aba10e49310 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -116,34 +116,7 @@ dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t flag,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
-	void *cpu_addr;
-
-	cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-
-	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-	return cpu_addr;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *cpu_addr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-	dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
 #define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
 
 /*
diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h
index 366460a81796..5294d03e59de 100644
--- a/arch/unicore32/include/asm/dma-mapping.h
+++ b/arch/unicore32/include/asm/dma-mapping.h
@@ -18,8 +18,6 @@
 #include <linux/scatterlist.h>
 #include <linux/swiotlb.h>
 
-#include <asm-generic/dma-coherent.h>
-
 #include <asm/memory.h>
 #include <asm/cacheflush.h>
 
@@ -82,28 +80,6 @@ static inline int dma_set_mask(struct device *dev, u64 dma_mask)
 	return 0;
 }
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t flag,
-				    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-	return dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *cpu_addr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-	dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1f5b7287d1ad..f9b1b6cc48b6 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,7 +12,6 @@
 #include <linux/dma-attrs.h>
 #include <asm/io.h>
 #include <asm/swiotlb.h>
-#include <asm-generic/dma-coherent.h>
 #include <linux/dma-contiguous.h>
 
 #ifdef CONFIG_ISA
@@ -41,6 +40,9 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 #endif
 }
 
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+#define arch_dma_alloc_attrs arch_dma_alloc_attrs
+
 #include <asm-generic/dma-mapping-common.h>
 
 /* Make sure we keep the same behaviour */
@@ -125,16 +127,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
        return gfp;
 }
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-void *
-dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		gfp_t gfp, struct dma_attrs *attrs);
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-void dma_free_attrs(struct device *dev, size_t size,
-		    void *vaddr, dma_addr_t bus,
-		    struct dma_attrs *attrs);
-
 #endif
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 353972c1946c..bd23971e8f1d 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -140,50 +140,19 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
 		free_pages((unsigned long)vaddr, get_order(size));
 }
 
-void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		      gfp_t gfp, struct dma_attrs *attrs)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *memory;
-
-	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-		return memory;
-
-	if (!dev)
-		dev = &x86_dma_fallback_dev;
-
-	if (!is_device_dma_capable(dev))
-		return NULL;
-
-	if (!ops->alloc)
-		return NULL;
-
-	memory = ops->alloc(dev, size, dma_handle,
-			    dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-	return memory;
-}
-EXPORT_SYMBOL(dma_alloc_attrs);
-
-void dma_free_attrs(struct device *dev, size_t size,
-		    void *vaddr, dma_addr_t bus,
-		    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	WARN_ON(irqs_disabled());       /* for portability */
+	*gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
+	*gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
 
-	if (dma_release_from_coherent(dev, get_order(size), vaddr))
-		return;
+	if (!*dev)
+		*dev = &x86_dma_fallback_dev;
+	if (!is_device_dma_capable(*dev))
+		return false;
+	return true;
 
-	debug_dma_free_coherent(dev, size, vaddr, bus);
-	if (ops->free)
-		ops->free(dev, size, vaddr, bus, attrs);
 }
-EXPORT_SYMBOL(dma_free_attrs);
+EXPORT_SYMBOL(arch_dma_alloc_attrs);
 
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index f01cb3044e50..bf24c908e5ff 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -34,37 +34,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
 #define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				    dma_addr_t *dma_handle, gfp_t gfp,
-				    struct dma_attrs *attrs)
-{
-	void *ret;
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-		return ret;
-
-	ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-	return ret;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *vaddr, dma_addr_t dma_handle,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (dma_release_from_coherent(dev, get_order(size), vaddr))
-		return;
-
-	ops->free(dev, size, vaddr, dma_handle, attrs);
-	debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-}
 
 static inline int
 dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 4c549323c605..da1029ef8159 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -311,9 +311,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	*/
 	flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
 
-	if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
-		return ret;
-
 	/* On ARM this function returns an ioremap'ped virtual address for
 	 * which virt_to_phys doesn't return the corresponding physical
 	 * address. In fact on ARM virt_to_phys only works for kernel direct
@@ -356,9 +353,6 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 	phys_addr_t phys;
 	u64 dma_mask = DMA_BIT_MASK(32);
 
-	if (dma_release_from_coherent(hwdev, order, vaddr))
-		return;
-
 	if (hwdev && hwdev->coherent_dma_mask)
 		dma_mask = hwdev->coherent_dma_mask;
 
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index 940d5ec122c9..56dd9ea2bc8c 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -6,6 +6,7 @@
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
+#include <asm-generic/dma-coherent.h>
 
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 					      size_t size,
@@ -237,4 +238,61 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
 
 #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL)
 
+#ifndef arch_dma_alloc_attrs
+#define arch_dma_alloc_attrs(dev, flag)	(true)
+#endif
+
+static inline void *dma_alloc_attrs(struct device *dev, size_t size,
+				       dma_addr_t *dma_handle, gfp_t flag,
+				       struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	void *cpu_addr;
+
+	BUG_ON(!ops);
+
+	if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr))
+		return cpu_addr;
+
+	if (!arch_dma_alloc_attrs(&dev, &flag))
+		return NULL;
+	if (!ops->alloc)
+		return NULL;
+
+	cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+	return cpu_addr;
+}
+
+static inline void dma_free_attrs(struct device *dev, size_t size,
+				     void *cpu_addr, dma_addr_t dma_handle,
+				     struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!ops);
+	WARN_ON(irqs_disabled());
+
+	if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
+		return;
+
+	if (!ops->free)
+		return;
+
+	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+	ops->free(dev, size, cpu_addr, dma_handle, attrs);
+}
+
+static inline void *dma_alloc_coherent(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t flag)
+{
+	return dma_alloc_attrs(dev, size, dma_handle, flag, NULL);
+}
+
+static inline void dma_free_coherent(struct device *dev, size_t size,
+		void *cpu_addr, dma_addr_t dma_handle)
+{
+	return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL);
+}
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From 452e06af1f0149b01201f94264d452cd7a95db7a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 9 Sep 2015 15:39:53 -0700
Subject: dma-mapping: consolidate dma_set_mask

Almost everyone implements dma_set_mask the same way, although some time
that's hidden in ->set_dma_mask methods.

This patch consolidates those into a common implementation that either
calls ->set_dma_mask if present or otherwise uses the default
implementation.  Some architectures used to only call ->set_dma_mask
after the initial checks, and those instance have been fixed to do the
full work.  h8300 implemented dma_set_mask bogusly as a no-ops and has
been fixed.

Unfortunately some architectures overload unrelated semantics like changing
the dma_ops into it so we still need to allow for an architecture override
for now.

[jcmvbkbc@gmail.com: fix xtensa]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/dma-mapping.h      |  5 -----
 arch/alpha/kernel/pci-noop.c              | 10 ----------
 arch/alpha/kernel/pci_iommu.c             | 11 -----------
 arch/arm/include/asm/dma-mapping.h        |  5 -----
 arch/arm64/include/asm/dma-mapping.h      |  9 ---------
 arch/h8300/include/asm/dma-mapping.h      |  5 -----
 arch/hexagon/include/asm/dma-mapping.h    |  1 -
 arch/hexagon/kernel/dma.c                 | 11 -----------
 arch/ia64/include/asm/dma-mapping.h       |  9 ---------
 arch/microblaze/include/asm/dma-mapping.h | 14 --------------
 arch/mips/include/asm/dma-mapping.h       | 16 ----------------
 arch/mips/loongson64/common/dma-swiotlb.c |  3 +++
 arch/openrisc/include/asm/dma-mapping.h   |  9 ---------
 arch/powerpc/include/asm/dma-mapping.h    |  4 +++-
 arch/s390/include/asm/dma-mapping.h       |  2 --
 arch/s390/pci/pci_dma.c                   | 10 ----------
 arch/sh/include/asm/dma-mapping.h         | 14 --------------
 arch/sparc/include/asm/dma-mapping.h      |  4 +++-
 arch/tile/include/asm/dma-mapping.h       |  6 ++++--
 arch/unicore32/include/asm/dma-mapping.h  | 10 ----------
 arch/x86/include/asm/dma-mapping.h        |  2 --
 arch/x86/kernel/pci-dma.c                 | 11 -----------
 arch/xtensa/include/asm/dma-mapping.h     | 11 -----------
 include/asm-generic/dma-mapping-common.h  | 15 +++++++++++++++
 24 files changed, 28 insertions(+), 169 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index 9d763e535c5a..72a8ca7796d9 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -12,11 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-	return get_dma_ops(dev)->set_dma_mask(dev, mask);
-}
-
 #define dma_cache_sync(dev, va, size, dir)		  ((void)0)
 
 #endif	/* _ALPHA_DMA_MAPPING_H */
diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c
index df24b76f9246..2b1f4a1e9272 100644
--- a/arch/alpha/kernel/pci-noop.c
+++ b/arch/alpha/kernel/pci-noop.c
@@ -166,15 +166,6 @@ static int alpha_noop_supported(struct device *dev, u64 mask)
 	return mask < 0x00ffffffUL ? 0 : 1;
 }
 
-static int alpha_noop_set_mask(struct device *dev, u64 mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-
-	*dev->dma_mask = mask;
-	return 0;
-}
-
 struct dma_map_ops alpha_noop_ops = {
 	.alloc			= alpha_noop_alloc_coherent,
 	.free			= alpha_noop_free_coherent,
@@ -182,7 +173,6 @@ struct dma_map_ops alpha_noop_ops = {
 	.map_sg			= alpha_noop_map_sg,
 	.mapping_error		= alpha_noop_mapping_error,
 	.dma_supported		= alpha_noop_supported,
-	.set_dma_mask		= alpha_noop_set_mask,
 };
 
 struct dma_map_ops *dma_ops = &alpha_noop_ops;
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index eddee7720343..8969bf2dfe3a 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -939,16 +939,6 @@ static int alpha_pci_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return dma_addr == 0;
 }
 
-static int alpha_pci_set_mask(struct device *dev, u64 mask)
-{
-	if (!dev->dma_mask ||
-	    !pci_dma_supported(alpha_gendev_to_pci(dev), mask))
-		return -EIO;
-
-	*dev->dma_mask = mask;
-	return 0;
-}
-
 struct dma_map_ops alpha_pci_ops = {
 	.alloc			= alpha_pci_alloc_coherent,
 	.free			= alpha_pci_free_coherent,
@@ -958,7 +948,6 @@ struct dma_map_ops alpha_pci_ops = {
 	.unmap_sg		= alpha_pci_unmap_sg,
 	.mapping_error		= alpha_pci_mapping_error,
 	.dma_supported		= alpha_pci_supported,
-	.set_dma_mask		= alpha_pci_set_mask,
 };
 
 struct dma_map_ops *dma_ops = &alpha_pci_ops;
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 2f9c731691c0..ccb3aa64640d 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -48,11 +48,6 @@ extern int dma_supported(struct device *dev, u64 mask);
  */
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-	return get_dma_ops(dev)->set_dma_mask(dev, mask);
-}
-
 #ifdef __arch_page_to_dma
 #error Please update to __arch_pfn_to_dma
 #endif
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index f519a58c55ae..cfdb34bedbcd 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -84,15 +84,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr)
 	return (phys_addr_t)dev_addr;
 }
 
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-	*dev->dma_mask = mask;
-
-	return 0;
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
 	if (!dev->dma_mask)
diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h
index 48d652eb1b5f..d9b5b806afe6 100644
--- a/arch/h8300/include/asm/dma-mapping.h
+++ b/arch/h8300/include/asm/dma-mapping.h
@@ -10,9 +10,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-	return 0;
-}
-
 #endif
diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h
index 36e8de710b32..268fde8a4575 100644
--- a/arch/hexagon/include/asm/dma-mapping.h
+++ b/arch/hexagon/include/asm/dma-mapping.h
@@ -45,7 +45,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #define HAVE_ARCH_DMA_SUPPORTED 1
 extern int dma_supported(struct device *dev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
 extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle);
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 			   enum dma_data_direction direction);
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index b74f9bae31a3..9e3ddf792bd3 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -44,17 +44,6 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-
-	*dev->dma_mask = mask;
-
-	return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 static struct gen_pool *coherent_pool;
 
 
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 7982caa7c5e7..9beccf8010bd 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -27,15 +27,6 @@ extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int,
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int
-dma_set_mask (struct device *dev, u64 mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-	*dev->dma_mask = mask;
-	return 0;
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
 	if (!dev->dma_mask)
diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h
index 3b453c503a43..24b12970c9cf 100644
--- a/arch/microblaze/include/asm/dma-mapping.h
+++ b/arch/microblaze/include/asm/dma-mapping.h
@@ -46,20 +46,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (unlikely(ops == NULL))
-		return -EIO;
-	if (ops->set_dma_mask)
-		return ops->set_dma_mask(dev, dma_mask);
-	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-		return -EIO;
-	*dev->dma_mask = dma_mask;
-	return 0;
-}
-
 static inline void __dma_sync(unsigned long paddr,
 			      size_t size, enum dma_data_direction direction)
 {
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 8bf8ec30a4b2..e604f760c4a0 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -31,22 +31,6 @@ static inline void dma_mark_clean(void *addr, size_t size) {}
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int
-dma_set_mask(struct device *dev, u64 mask)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if(!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-
-	if (ops->set_dma_mask)
-		return ops->set_dma_mask(dev, mask);
-
-	*dev->dma_mask = mask;
-
-	return 0;
-}
-
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction);
 
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
index ef9da3b5c543..4ffa6fc81c8f 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
@@ -85,6 +85,9 @@ static void loongson_dma_sync_sg_for_device(struct device *dev,
 
 static int loongson_dma_set_mask(struct device *dev, u64 mask)
 {
+	if (!dev->dma_mask || !dma_supported(dev, mask))
+		return -EIO;
+
 	if (mask > DMA_BIT_MASK(loongson_sysconf.dma_mask_bits)) {
 		*dev->dma_mask = DMA_BIT_MASK(loongson_sysconf.dma_mask_bits);
 		return -EIO;
diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h
index 8fc08b883477..413bfcf86384 100644
--- a/arch/openrisc/include/asm/dma-mapping.h
+++ b/arch/openrisc/include/asm/dma-mapping.h
@@ -44,13 +44,4 @@ static inline int dma_supported(struct device *dev, u64 dma_mask)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-		return -EIO;
-
-	*dev->dma_mask = dma_mask;
-
-	return 0;
-}
 #endif	/* __ASM_OPENRISC_DMA_MAPPING_H */
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index dd43e0c6f219..7f522c021dc3 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -122,9 +122,11 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
 /* this will be removed soon */
 #define flush_write_buffers()
 
+#define HAVE_ARCH_DMA_SET_MASK 1
+extern int dma_set_mask(struct device *dev, u64 dma_mask);
+
 #include <asm-generic/dma-mapping-common.h>
 
-extern int dma_set_mask(struct device *dev, u64 dma_mask);
 extern int __dma_set_mask(struct device *dev, u64 dma_mask);
 extern u64 __dma_get_required_mask(struct device *dev);
 
diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h
index 1f42489797da..b3fd54d93dd2 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -18,8 +18,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 	return &s390_dma_ops;
 }
 
-extern int dma_set_mask(struct device *dev, u64 mask);
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 				  enum dma_data_direction direction)
 {
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 42b76580c8b8..37505b8b4093 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -262,16 +262,6 @@ out:
 	spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
 }
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-
-	*dev->dma_mask = mask;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(dma_set_mask);
-
 static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
 				     unsigned long offset, size_t size,
 				     enum dma_data_direction direction,
diff --git a/arch/sh/include/asm/dma-mapping.h b/arch/sh/include/asm/dma-mapping.h
index 088f6e5f1a92..a3745a3fe029 100644
--- a/arch/sh/include/asm/dma-mapping.h
+++ b/arch/sh/include/asm/dma-mapping.h
@@ -13,20 +13,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-	if (ops->set_dma_mask)
-		return ops->set_dma_mask(dev, mask);
-
-	*dev->dma_mask = mask;
-
-	return 0;
-}
-
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		    enum dma_data_direction dir);
 
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 184651bb0b46..a21da597b0b5 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -37,7 +37,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 	return dma_ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
+#define HAVE_ARCH_DMA_SET_MASK 1
 
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
@@ -52,4 +52,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
 	return -EINVAL;
 }
 
+#include <asm-generic/dma-mapping-common.h>
+
 #endif
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 559ed4a60077..96ac6cce4a32 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -59,8 +59,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-#include <asm-generic/dma-mapping-common.h>
-
 static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
 {
 	dev->archdata.dma_ops = ops;
@@ -74,6 +72,10 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 	return addr + size - 1 <= *dev->dma_mask;
 }
 
+#define HAVE_ARCH_DMA_SET_MASK 1
+
+#include <asm-generic/dma-mapping-common.h>
+
 static inline int
 dma_set_mask(struct device *dev, u64 mask)
 {
diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h
index 21231c14182c..8140e053ccd3 100644
--- a/arch/unicore32/include/asm/dma-mapping.h
+++ b/arch/unicore32/include/asm/dma-mapping.h
@@ -50,16 +50,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-		return -EIO;
-
-	*dev->dma_mask = dma_mask;
-
-	return 0;
-}
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr,
 		size_t size, enum dma_data_direction direction)
 {
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index b1fbf582048b..953b7263f844 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -48,8 +48,6 @@ extern int dma_supported(struct device *hwdev, u64 mask);
 
 #include <asm-generic/dma-mapping-common.h>
 
-extern int dma_set_mask(struct device *dev, u64 mask);
-
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 					dma_addr_t *dma_addr, gfp_t flag,
 					struct dma_attrs *attrs);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index bd23971e8f1d..84b8ef82a159 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev);
 /* Number of entries preallocated for DMA-API debugging */
 #define PREALLOC_DMA_DEBUG_ENTRIES       65536
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-
-	*dev->dma_mask = mask;
-
-	return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 void __init pci_iommu_alloc(void)
 {
 	struct iommu_table_entry *p;
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index 329abc7211e9..4427f38b634e 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -32,17 +32,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int
-dma_set_mask(struct device *dev, u64 mask)
-{
-	if(!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-
-	*dev->dma_mask = mask;
-
-	return 0;
-}
-
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		    enum dma_data_direction direction);
 
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index 67fa6bcd644c..b1bc954eccf3 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -340,4 +340,19 @@ static inline int dma_supported(struct device *dev, u64 mask)
 }
 #endif
 
+#ifndef HAVE_ARCH_DMA_SET_MASK
+static inline int dma_set_mask(struct device *dev, u64 mask)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (ops->set_dma_mask)
+		return ops->set_dma_mask(dev, mask);
+
+	if (!dev->dma_mask || !dma_supported(dev, mask))
+		return -EIO;
+	*dev->dma_mask = mask;
+	return 0;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3-70-g09d2