diff options
| author | Ingo Molnar <mingo@kernel.org> | 2015-03-31 10:47:18 +0200 | 
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2015-03-31 10:47:18 +0200 | 
| commit | f5c8a104116a56503b6e824e7782b2e805b29abb (patch) | |
| tree | 3da03b9d29bc15d06f19348f34a72d29f5a2ab49 | |
| parent | c9ce8712838e48bf356144122c5ecdcdac5d1829 (diff) | |
| parent | 43eaa2a1ad70d72876cdbb2eb5450a2665e4770f (diff) | |
Merge tag 'amd_severity' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/ras

Pull RAS update from Borislav Petkov:

  "This has been long in the making - an AMD-specific MCE-severity grading
   function. And it is actually readable at a quick glance. Further error
   recovery actions will be based on its output.
   Patches tested on every relevant AMD family out there."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
| -rw-r--r-- | arch/x86/include/asm/mce.h | 8 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 2 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 67 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 10 | 
4 files changed, 85 insertions, 2 deletions
```diff
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fd38a23e729f..1f5a86d518db 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -116,6 +116,12 @@ struct mca_config {
 	u32 rip_msr;
 };
 
+struct mce_vendor_flags {
+	__u64		overflow_recov	: 1, /* cpuid_ebx(80000007) */
+			__reserved_0	: 63;
+};
+extern struct mce_vendor_flags mce_flags;
+
 extern struct mca_config mca_cfg;
 extern void mce_register_decode_chain(struct notifier_block *nb);
 extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
 #ifdef CONFIG_X86_MCE
 int mcheck_init(void);
 void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_vendor_init_severity(void);
 #else
 static inline int mcheck_init(void) { return 0; }
 static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_vendor_init_severity(void) {}
 #endif
 
 #ifdef CONFIG_X86_ANCIENT_MCE
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index e12f0bfb45c1..fe32074b865b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -24,7 +24,7 @@ struct mce_bank {
 	char			attrname[ATTR_LEN];	/* attribute name */
 };
 
-int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
 struct dentry *mce_get_debugfs_dir(void);
 
 extern struct mce_bank *mce_banks;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..155c9261d3ef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,7 +186,62 @@ static int error_context(struct mce *m)
 	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
 }
 
-int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+	enum context ctx = error_context(m);
+
+	/* Processor Context Corrupt, no need to fumble too much, die! */
+	if (m->status & MCI_STATUS_PCC)
+		return MCE_PANIC_SEVERITY;
+
+	if (m->status & MCI_STATUS_UC) {
+
+		/*
+		 * On older systems where overflow_recov flag is not present, we
+		 * should simply panic if an error overflow occurs. If
+		 * overflow_recov flag is present and set, then software can try
+		 * to at least kill process to prolong system operation.
+		 */
+		if (mce_flags.overflow_recov) {
+			/* software can try to contain */
+			if (!(m->mcgstatus & MCG_STATUS_RIPV))
+				if (ctx == IN_KERNEL)
+					return MCE_PANIC_SEVERITY;
+
+				/* kill current process */
+				return MCE_AR_SEVERITY;
+		} else {
+			/* at least one error was not logged */
+			if (m->status & MCI_STATUS_OVER)
+				return MCE_PANIC_SEVERITY;
+		}
+
+		/*
+		 * For any other case, return MCE_UC_SEVERITY so that we log the
+		 * error and exit #MC handler.
+		 */
+		return MCE_UC_SEVERITY;
+	}
+
+	/*
+	 * deferred error: poll handler catches these and adds to mce_ring so
+	 * memory-failure can take recovery actions.
+	 */
+	if (m->status & MCI_STATUS_DEFERRED)
+		return MCE_DEFERRED_SEVERITY;
+
+	/*
+	 * corrected error: poll handler catches these and passes responsibility
+	 * of decoding the error to EDAC
+	 */
+	return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
 {
 	enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
 	enum context ctx = error_context(m);
@@ -216,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
 	}
 }
 
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+		    mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		mce_severity = mce_severity_amd;
+}
+
 #ifdef CONFIG_DEBUG_FS
 static void *s_start(struct seq_file *f, loff_t *pos)
 {
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8548b714a16b..c7df30748629 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
 struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
 
 struct mca_config mca_cfg __read_mostly = {
 	.bootlog  = -1,
@@ -1535,6 +1536,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 			mce_banks[0].ctl = 0;
 
 		/*
+		 * overflow_recov is supported for F15h Models 00h-0fh
+		 * even though we don't have a CPUID bit for it.
+		 */
+		if (c->x86 == 0x15 && c->x86_model <= 0xf)
+			mce_flags.overflow_recov = 1;
+
+		/*
 		 * Turn off MC4_MISC thresholding banks on those models since
 		 * they're not supported there.
 		 */
@@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 		break;
 	case X86_VENDOR_AMD:
 		mce_amd_feature_init(c);
+		mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
 		break;
 	default:
 		break;
@@ -2017,6 +2026,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
 	mcheck_intel_therm_init();
+	mcheck_vendor_init_severity();
 
 	return 0;
 }
```
