From e8b9b0cc8269c85d8167aaee024bfcbb4976c031 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Sat, 14 Oct 2017 09:59:49 -0700
Subject: x86/mm/64: Remove the last VM_BUG_ON() from the TLB code

Let's avoid hard-to-diagnose crashes in the future.

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/f423bbc97864089fbdeb813f1ea126c6eaed844a.1508000261.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/tlb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/mm/tlb.c')

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 658bf0090565..7db23f9f804e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -147,8 +147,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	this_cpu_write(cpu_tlbstate.is_lazy, false);
 
 	if (real_prev == next) {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
-			  next->context.ctx_id);
+		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+			   next->context.ctx_id);
 
 		/*
 		 * We don't currently support having a real mm loaded without
--
cgit v1.2.3-70-g09d2

From 4e57b94664fef55aa71cac33b4632fdfdd52b695 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Sat, 14 Oct 2017 09:59:50 -0700
Subject: x86/mm: Tidy up "x86/mm: Flush more aggressively in lazy TLB mode"

Due to timezones, commit:

  b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode")

was an outdated version of the patch: it was well tested and fixed the
bug, but it didn't address Borislav's review comments.

Tidy it up:

 - The name "tlb_use_lazy_mode()" was highly confusing.  Change it to
   "tlb_defer_switch_to_init_mm()", which describes what it actually
   means.

 - Move the static_branch crap into a helper.

 - Improve comments.

Actually removing the debugfs option is in the next patch.

Reported-by: Borislav Petkov
Signed-off-by: Andy Lutomirski
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Fixes: b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode")
Link: http://lkml.kernel.org/r/154ef95428d4592596b6e98b0af1d2747d6cfbf8.1508000261.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/tlbflush.h |  7 ++++++-
 arch/x86/mm/tlb.c               | 30 ++++++++++++++++++------------
 2 files changed, 24 insertions(+), 13 deletions(-)

(limited to 'arch/x86/mm/tlb.c')

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d362161d3291..0d4a1bb7e303 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -87,7 +87,12 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
  * to init_mm when we switch to a kernel thread (e.g. the idle thread).  If
  * it's false, then we immediately switch CR3 when entering a kernel thread.
  */
-DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+DECLARE_STATIC_KEY_TRUE(__tlb_defer_switch_to_init_mm);
+
+static inline bool tlb_defer_switch_to_init_mm(void)
+{
+	return static_branch_unlikely(&__tlb_defer_switch_to_init_mm);
+}
 
 /*
  * 6 because 6 should be plenty and struct tlb_state will fit in
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 7db23f9f804e..5ee3b59baa85 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,7 +30,7 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
-DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+DEFINE_STATIC_KEY_TRUE(__tlb_defer_switch_to_init_mm);
 
 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 			    u16 *new_asid, bool *need_flush)
@@ -213,6 +213,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 }
 
 /*
+ * Please ignore the name of this function.  It should be called
+ * switch_to_kernel_thread().
+ *
  * enter_lazy_tlb() is a hint from the scheduler that we are entering a
  * kernel thread or other context without an mm.  Acceptable implementations
  * include doing nothing whatsoever, switching to init_mm, or various clever
@@ -227,7 +230,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	if (static_branch_unlikely(&tlb_use_lazy_mode)) {
+	if (tlb_defer_switch_to_init_mm()) {
 		/*
 		 * There's a significant optimization that may be possible
 		 * here.  We have accurate enough TLB flush tracking that we
@@ -632,7 +635,8 @@ static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
 {
 	char buf[2];
 
-	buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
+	buf[0] = static_branch_likely(&__tlb_defer_switch_to_init_mm)
+			? '1' : '0';
 	buf[1] = '\n';
 
 	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
@@ -647,9 +651,9 @@ static ssize_t tlblazy_write_file(struct file *file,
 		return -EINVAL;
 
 	if (val)
-		static_branch_enable(&tlb_use_lazy_mode);
+		static_branch_enable(&__tlb_defer_switch_to_init_mm);
 	else
-		static_branch_disable(&tlb_use_lazy_mode);
+		static_branch_disable(&__tlb_defer_switch_to_init_mm);
 
 	return count;
 }
@@ -660,23 +664,25 @@ static const struct file_operations fops_tlblazy = {
 	.llseek = default_llseek,
 };
 
-static int __init init_tlb_use_lazy_mode(void)
+static int __init init_tlblazy(void)
 {
 	if (boot_cpu_has(X86_FEATURE_PCID)) {
 		/*
-		 * Heuristic: with PCID on, switching to and from
-		 * init_mm is reasonably fast, but remote flush IPIs
-		 * as expensive as ever, so turn off lazy TLB mode.
+		 * If we have PCID, then switching to init_mm is reasonably
+		 * fast.  If we don't have PCID, then switching to init_mm is
+		 * quite slow, so we default to trying to defer it in the
+		 * hopes that we can avoid it entirely.  The latter approach
+		 * runs the risk of receiving otherwise unnecessary IPIs.
		 *
		 * We can't do this in setup_pcid() because static keys
		 * haven't been initialized yet, and it would blow up
		 * badly.
		 */
-		static_branch_disable(&tlb_use_lazy_mode);
+		static_branch_disable(&__tlb_defer_switch_to_init_mm);
 	}
 
-	debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
+	debugfs_create_file("tlb_defer_switch_to_init_mm", S_IRUSR | S_IWUSR,
 			    arch_debugfs_dir, NULL, &fops_tlblazy);
 	return 0;
 }
-late_initcall(init_tlb_use_lazy_mode);
+late_initcall(init_tlblazy);
--
cgit v1.2.3-70-g09d2

From 7ac7f2c315ef76437f5119df354d334448534fb5 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Sat, 14 Oct 2017 09:59:51 -0700
Subject: x86/mm: Remove debug/x86/tlb_defer_switch_to_init_mm

Borislav thinks that we don't need this knob in a released kernel.
Get rid of it.

Requested-by: Borislav Petkov
Signed-off-by: Andy Lutomirski
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Fixes: b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode")
Link: http://lkml.kernel.org/r/1fa72431924e81e86c164ff7881bf9240d1f1a6c.1508000261.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/tlbflush.h | 20 ++++++++------
 arch/x86/mm/tlb.c               | 58 -----------------------------------------
 2 files changed, 12 insertions(+), 66 deletions(-)

(limited to 'arch/x86/mm/tlb.c')

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0d4a1bb7e303..c4aed0de565e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,16 +82,20 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
-/*
- * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
- * to init_mm when we switch to a kernel thread (e.g. the idle thread).  If
- * it's false, then we immediately switch CR3 when entering a kernel thread.
- */
-DECLARE_STATIC_KEY_TRUE(__tlb_defer_switch_to_init_mm);
-
 static inline bool tlb_defer_switch_to_init_mm(void)
 {
-	return static_branch_unlikely(&__tlb_defer_switch_to_init_mm);
+	/*
+	 * If we have PCID, then switching to init_mm is reasonably
+	 * fast.  If we don't have PCID, then switching to init_mm is
+	 * quite slow, so we try to defer it in the hopes that we can
+	 * avoid it entirely.  The latter approach runs the risk of
+	 * receiving otherwise unnecessary IPIs.
+	 *
+	 * This choice is just a heuristic.  The tlb code can handle this
+	 * function returning true or false regardless of whether we have
+	 * PCID.
+	 */
+	return !static_cpu_has(X86_FEATURE_PCID);
 }
 
 /*
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5ee3b59baa85..0f3d0cea4d00 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,7 +30,6 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
-DEFINE_STATIC_KEY_TRUE(__tlb_defer_switch_to_init_mm);
 
 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 			    u16 *new_asid, bool *need_flush)
@@ -629,60 +628,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
 	return 0;
 }
 late_initcall(create_tlb_single_page_flush_ceiling);
-
-static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
-				 size_t count, loff_t *ppos)
-{
-	char buf[2];
-
-	buf[0] = static_branch_likely(&__tlb_defer_switch_to_init_mm)
-			? '1' : '0';
-	buf[1] = '\n';
-
-	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
-}
-
-static ssize_t tlblazy_write_file(struct file *file,
-		 const char __user *user_buf, size_t count, loff_t *ppos)
-{
-	bool val;
-
-	if (kstrtobool_from_user(user_buf, count, &val))
-		return -EINVAL;
-
-	if (val)
-		static_branch_enable(&__tlb_defer_switch_to_init_mm);
-	else
-		static_branch_disable(&__tlb_defer_switch_to_init_mm);
-
-	return count;
-}
-
-static const struct file_operations fops_tlblazy = {
-	.read = tlblazy_read_file,
-	.write = tlblazy_write_file,
-	.llseek = default_llseek,
-};
-
-static int __init init_tlblazy(void)
-{
-	if (boot_cpu_has(X86_FEATURE_PCID)) {
-		/*
-		 * If we have PCID, then switching to init_mm is reasonably
-		 * fast.  If we don't have PCID, then switching to init_mm is
-		 * quite slow, so we default to trying to defer it in the
-		 * hopes that we can avoid it entirely.  The latter approach
-		 * runs the risk of receiving otherwise unnecessary IPIs.
-		 *
-		 * We can't do this in setup_pcid() because static keys
-		 * haven't been initialized yet, and it would blow up
-		 * badly.
-		 */
-		static_branch_disable(&__tlb_defer_switch_to_init_mm);
-	}
-
-	debugfs_create_file("tlb_defer_switch_to_init_mm", S_IRUSR | S_IWUSR,
-			    arch_debugfs_dir, NULL, &fops_tlblazy);
-	return 0;
-}
-late_initcall(init_tlblazy);
--
cgit v1.2.3-70-g09d2

From 675357362aeba19688440eb1aaa7991067f73b12 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Sat, 4 Nov 2017 04:16:12 -0700
Subject: Revert "x86/mm: Stop calling leave_mm() in idle code"

This reverts commit 43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5.

The reason I removed the leave_mm() calls in question is that the
heuristic wasn't needed after that patch.  With the original version of
my PCID series, we never flushed a "lazy cpu" (i.e. a CPU running a
kernel thread) due to a flush on the loaded mm.

Unfortunately, that caused architectural issues, so now I've reinstated
these flushes on non-PCID systems in:

  commit b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode").

That, in turn, gives us a power management and occasionally performance
regression as compared to old kernels: a process that goes into a deep
idle state on a given CPU and gets its mm flushed due to activity on a
different CPU will wake the idle CPU.

Reinstate the old ugly heuristic: a CPU that goes into ACPI C3 or an
intel_idle state that is likely to cause a TLB flush gets its mm
switched to init_mm before going idle.

FWIW, this heuristic is lousy.  Whether we should change CR3 before idle
isn't a good hint except insofar as the performance hit is a bit lower
if the TLB is getting flushed by the idle code anyway.  What we really
want to know is whether we anticipate being idle long enough that the mm
is likely to be flushed before we wake up.  This is more a matter of the
expected latency than the idle state that gets chosen.  This heuristic
also completely fails on systems that don't know whether the TLB will be
flushed (e.g. AMD systems?).  OTOH it may be a bit obsolete anyway --
PCID systems don't presently benefit from this heuristic at all.

We also shouldn't do this callback from the innermost bit of the idle
code due to the RCU nastiness it causes.  All the information needed is
available before rcu_idle_enter() needs to happen.

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Josh Poimboeuf
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Fixes: 43858b4f25cf ("x86/mm: Stop calling leave_mm() in idle code")
Link: http://lkml.kernel.org/r/c513bbd4e653747213e05bc7062de000bf0202a5.1509793738.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/ia64/include/asm/acpi.h  |  2 ++
 arch/x86/include/asm/acpi.h   |  2 ++
 arch/x86/mm/tlb.c             | 17 ++++++++++++++---
 drivers/acpi/processor_idle.c |  2 ++
 drivers/idle/intel_idle.c     |  9 +++++----
 5 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'arch/x86/mm/tlb.c')

diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index c86a947f5368..a3d0211970e9 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -112,6 +112,8 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
 	buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
 }
 
+#define acpi_unlazy_tlb(x)
+
 #ifdef CONFIG_ACPI_NUMA
 extern cpumask_t early_cpu_possible_map;
 #define for_each_possible_early_cpu(cpu)	\
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 72d867f6b518..8d0ec9df1cbe 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -150,6 +150,8 @@ static inline void disable_acpi(void) { }
 extern int x86_acpi_numa_init(void);
 #endif /* CONFIG_ACPI_NUMA */
 
+#define acpi_unlazy_tlb(x)	leave_mm(x)
+
 #ifdef CONFIG_ACPI_APEI
 static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
 {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0f3d0cea4d00..3118392cdf75 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -85,6 +85,7 @@ void leave_mm(int cpu)
 
 	switch_mm(NULL, &init_mm, NULL);
 }
+EXPORT_SYMBOL_GPL(leave_mm);
 
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	       struct task_struct *tsk)
@@ -195,12 +196,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 		write_cr3(build_cr3(next, new_asid));
-		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-				TLB_FLUSH_ALL);
+
+		/*
+		 * NB: This gets called via leave_mm() in the idle path
+		 * where RCU functions differently.  Tracing normally
+		 * uses RCU, so we need to use the _rcuidle variant.
+		 *
+		 * (There is no good reason for this.  The idle code should
+		 *  be rearranged to call this before rcu_idle_enter().)
+		 */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 	} else {
 		/* The new ASID is already up to date. */
 		write_cr3(build_cr3_noflush(next, new_asid));
-		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+
+		/* See above wrt _rcuidle. */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
 
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 2736e25e9dc6..d50a7b6ccddd 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -710,6 +710,8 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
 static void acpi_idle_enter_bm(struct acpi_processor *pr,
 			       struct acpi_processor_cx *cx, bool timer_bc)
 {
+	acpi_unlazy_tlb(smp_processor_id());
+
 	/*
 	 * Must be done before busmaster disable as we might need to
 	 * access HPET !
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 5dc7ea4b6bc4..f0b06b14e782 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -913,15 +913,16 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
 	struct cpuidle_state *state = &drv->states[index];
 	unsigned long eax = flg2MWAIT(state->flags);
 	unsigned int cstate;
+	int cpu = smp_processor_id();
 
 	cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
 
 	/*
-	 * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition
-	 * will probably flush the TLB.  It's not guaranteed to flush
-	 * the TLB, though, so it's not clear that we can do anything
-	 * useful with this knowledge.
+	 * leave_mm() to avoid costly and often unnecessary wakeups
+	 * for flushing the user TLB's associated with the active mm.
 	 */
+	if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
+		leave_mm(cpu);
 
 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
 		tick_broadcast_enter();
--
cgit v1.2.3-70-g09d2
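Editor's note: taken together, the series leaves the x86 lazy-TLB policy looking roughly like the sketch below. This is an illustrative, consolidated view assembled from the hunks above, not an additional patch; the enter_lazy_tlb() branch bodies are only context in the diffs, so they are paraphrased here and should be treated as an assumption rather than part of the series.

/*
 * Illustrative sketch only -- assembled from the hunks above, with
 * surrounding kernel context elided.
 */

/* After "Remove debug/x86/tlb_defer_switch_to_init_mm": the policy is a
 * plain CPU-feature check; the static key and the debugfs knob are gone. */
static inline bool tlb_defer_switch_to_init_mm(void)
{
	/* With PCID, switching to init_mm is cheap; without it, defer. */
	return !static_cpu_has(X86_FEATURE_PCID);
}

/* After the "Tidy up" patch, enter_lazy_tlb() consults the helper above.
 * NOTE: the branch bodies below are paraphrased (assumed), not shown in
 * the hunks. */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
		return;

	if (tlb_defer_switch_to_init_mm()) {
		/* Defer: keep the user CR3 loaded and mark the CPU lazy. */
		this_cpu_write(cpu_tlbstate.is_lazy, true);
	} else {
		/* Switch to init_mm immediately (leave_mm()-style). */
		switch_mm(NULL, &init_mm, NULL);
	}
}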