diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-02 11:47:07 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-02 11:47:07 -0700 |
commit | 86bbbebac1933e6e95e8234c4f7d220c5ddd38bc (patch) | |
tree | f81ce735fbb8c64d6188875baafe781ae0944a5a | |
parent | 486adcea4a63bec206cba6f0d7f301fb945ae9d3 (diff) | |
parent | e2efacb6a54ab54626da3507be1008d0040492cc (diff) |
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Ingo Molnar:
"The main changes in this cycle were:
- AMD MCE support/decoding improvements (Yazen Ghannam)
- general MCE header cleanups and reorganization (Borislav Petkov)"
* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
Revert "x86/mce/AMD: Collect error info even if valid bits are not set"
x86/MCE: Cleanup and complete struct mce fields definitions
x86/mce/AMD: Carve out SMCA get_block_address() code
x86/mce/AMD: Get address from already initialized block
x86/mce/AMD, EDAC/mce_amd: Enumerate Reserved SMCA bank type
x86/mce/AMD: Pass the bank number to smca_get_bank_type()
x86/mce/AMD: Collect error info even if valid bits are not set
x86/mce: Issue the 'mcelog --ascii' message only on !AMD
x86/mce: Convert 'struct mca_config' bools to a bitfield
x86/mce: Put private structures and definitions into the internal header
-rw-r--r-- | arch/x86/include/asm/mce.h | 53 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/mce.h | 52 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 59 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 20 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_amd.c | 78 | ||||
-rw-r--r-- | drivers/edac/mce_amd.c | 11 |
6 files changed, 156 insertions, 117 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 96ea4b5ba658..8c7b3e5a2d01 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -138,58 +138,6 @@ struct mce_log_buffer { struct mce entry[MCE_LOG_LEN]; }; -struct mca_config { - bool dont_log_ce; - bool cmci_disabled; - bool lmce_disabled; - bool ignore_ce; - bool disabled; - bool ser; - bool recovery; - bool bios_cmci_threshold; - u8 banks; - s8 bootlog; - int tolerant; - int monarch_timeout; - int panic_timeout; - u32 rip_msr; -}; - -struct mce_vendor_flags { - /* - * Indicates that overflow conditions are not fatal, when set. - */ - __u64 overflow_recov : 1, - - /* - * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and - * Recovery. It indicates support for data poisoning in HW and deferred - * error interrupts. - */ - succor : 1, - - /* - * (AMD) SMCA: This bit indicates support for Scalable MCA which expands - * the register space for each MCA bank and also increases number of - * banks. Also, to accommodate the new banks and registers, the MCA - * register space is moved to a new MSR range. - */ - smca : 1, - - __reserved_0 : 61; -}; - -struct mca_msr_regs { - u32 (*ctl) (int bank); - u32 (*status) (int bank); - u32 (*addr) (int bank); - u32 (*misc) (int bank); -}; - -extern struct mce_vendor_flags mce_flags; - -extern struct mca_msr_regs msr_ops; - enum mce_notifier_prios { MCE_PRIO_FIRST = INT_MAX, MCE_PRIO_SRAO = INT_MAX - 1, @@ -346,6 +294,7 @@ enum smca_bank_types { SMCA_IF, /* Instruction Fetch */ SMCA_L2_CACHE, /* L2 Cache */ SMCA_DE, /* Decoder Unit */ + SMCA_RESERVED, /* Reserved */ SMCA_EX, /* Execution Unit */ SMCA_FP, /* Floating Point */ SMCA_L3_CACHE, /* L3 Cache */ diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 435db58a7bad..955c2a2e1cf9 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -5,32 +5,36 @@ #include <linux/types.h> #include <linux/ioctl.h> -/* Fields are zero when not available */ +/* + * Fields are zero when not available. Also, this struct is shared with + * userspace mcelog and thus must keep existing fields at current offsets. + * Only add new fields to the end of the structure + */ struct mce { - __u64 status; - __u64 misc; - __u64 addr; - __u64 mcgstatus; - __u64 ip; - __u64 tsc; /* cpu time stamp counter */ - __u64 time; /* wall time_t when error was detected */ - __u8 cpuvendor; /* cpu vendor as encoded in system.h */ - __u8 inject_flags; /* software inject flags */ - __u8 severity; + __u64 status; /* Bank's MCi_STATUS MSR */ + __u64 misc; /* Bank's MCi_MISC MSR */ + __u64 addr; /* Bank's MCi_ADDR MSR */ + __u64 mcgstatus; /* Machine Check Global Status MSR */ + __u64 ip; /* Instruction Pointer when the error happened */ + __u64 tsc; /* CPU time stamp counter */ + __u64 time; /* Wall time_t when error was detected */ + __u8 cpuvendor; /* Kernel's X86_VENDOR enum */ + __u8 inject_flags; /* Software inject flags */ + __u8 severity; /* Error severity */ __u8 pad; - __u32 cpuid; /* CPUID 1 EAX */ - __u8 cs; /* code segment */ - __u8 bank; /* machine check bank */ - __u8 cpu; /* cpu number; obsolete; use extcpu now */ - __u8 finished; /* entry is valid */ - __u32 extcpu; /* linux cpu number that detected the error */ - __u32 socketid; /* CPU socket ID */ - __u32 apicid; /* CPU initial apic ID */ - __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ - __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ - __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ - __u64 ppin; /* Protected Processor Inventory Number */ - __u32 microcode;/* Microcode revision */ + __u32 cpuid; /* CPUID 1 EAX */ + __u8 cs; /* Code segment */ + __u8 bank; /* Machine check bank reporting the error */ + __u8 cpu; /* CPU number; obsoleted by extcpu */ + __u8 finished; /* Entry is valid */ + __u32 extcpu; /* Linux CPU number that detected the error */ + __u32 socketid; /* CPU socket ID */ + __u32 apicid; /* CPU initial APIC ID */ + __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ + __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ + __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + __u64 ppin; /* Protected Processor Inventory Number */ + __u32 microcode; /* Microcode revision */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index e956eb267061..374d1aa66952 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -113,8 +113,6 @@ static inline void mce_register_injector_chain(struct notifier_block *nb) { } static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif -extern struct mca_config mca_cfg; - #ifndef CONFIG_X86_64 /* * On 32-bit systems it would be difficult to safely unmap a poison page @@ -130,4 +128,61 @@ static inline void mce_unmap_kpfn(unsigned long pfn) {} #define mce_unmap_kpfn mce_unmap_kpfn #endif +struct mca_config { + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + + __u64 lmce_disabled : 1, + disabled : 1, + ser : 1, + recovery : 1, + bios_cmci_threshold : 1, + __reserved : 59; + + u8 banks; + s8 bootlog; + int tolerant; + int monarch_timeout; + int panic_timeout; + u32 rip_msr; +}; + +extern struct mca_config mca_cfg; + +struct mce_vendor_flags { + /* + * Indicates that overflow conditions are not fatal, when set. + */ + __u64 overflow_recov : 1, + + /* + * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and + * Recovery. It indicates support for data poisoning in HW and deferred + * error interrupts. + */ + succor : 1, + + /* + * (AMD) SMCA: This bit indicates support for Scalable MCA which expands + * the register space for each MCA bank and also increases number of + * banks. Also, to accommodate the new banks and registers, the MCA + * register space is moved to a new MSR range. + */ + smca : 1, + + __reserved_0 : 61; +}; + +extern struct mce_vendor_flags mce_flags; + +struct mca_msr_regs { + u32 (*ctl) (int bank); + u32 (*status) (int bank); + u32 (*addr) (int bank); + u32 (*misc) (int bank); +}; + +extern struct mca_msr_regs msr_ops; + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 466f47301334..3c1eec17312b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -273,7 +273,9 @@ static void __print_mce(struct mce *m) static void print_mce(struct mce *m) { __print_mce(m); - pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); + + if (m->cpuvendor != X86_VENDOR_AMD) + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -1516,7 +1518,7 @@ static int __mcheck_cpu_cap_init(void) mca_cfg.rip_msr = MSR_IA32_MCG_EIP; if (cap & MCG_SER_P) - mca_cfg.ser = true; + mca_cfg.ser = 1; return 0; } @@ -1824,12 +1826,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { - mca_cfg.disabled = true; + mca_cfg.disabled = 1; return; } if (mce_gen_pool_init()) { - mca_cfg.disabled = true; + mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); return; } @@ -1907,11 +1909,11 @@ static int __init mcheck_enable(char *str) if (*str == '=') str++; if (!strcmp(str, "off")) - cfg->disabled = true; + cfg->disabled = 1; else if (!strcmp(str, "no_cmci")) cfg->cmci_disabled = true; else if (!strcmp(str, "no_lmce")) - cfg->lmce_disabled = true; + cfg->lmce_disabled = 1; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) @@ -1919,9 +1921,9 @@ static int __init mcheck_enable(char *str) else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) - cfg->bios_cmci_threshold = true; + cfg->bios_cmci_threshold = 1; else if (!strcmp(str, "recovery")) - cfg->recovery = true; + cfg->recovery = 1; else if (isdigit(str[0])) { if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); @@ -2403,7 +2405,7 @@ device_initcall_sync(mcheck_init_device); */ static int __init mcheck_disable(char *str) { - mca_cfg.disabled = true; + mca_cfg.disabled = 1; return 1; } __setup("nomce", mcheck_disable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 0f32ad242324..f7666eef4a87 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -82,6 +82,7 @@ static struct smca_bank_name smca_names[] = { [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_RESERVED] = { "reserved", "Reserved" }, [SMCA_EX] = { "execution_unit", "Execution Unit" }, [SMCA_FP] = { "floating_point", "Floating Point Unit" }, [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, @@ -110,14 +111,14 @@ const char *smca_get_long_name(enum smca_bank_types t) } EXPORT_SYMBOL_GPL(smca_get_long_name); -static enum smca_bank_types smca_get_bank_type(struct mce *m) +static enum smca_bank_types smca_get_bank_type(unsigned int bank) { struct smca_bank *b; - if (m->bank >= N_SMCA_BANK_TYPES) + if (bank >= MAX_NR_BANKS) return N_SMCA_BANK_TYPES; - b = &smca_banks[m->bank]; + b = &smca_banks[bank]; if (!b->hwid) return N_SMCA_BANK_TYPES; @@ -127,6 +128,9 @@ static enum smca_bank_types smca_get_bank_type(struct mce *m) static struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype, xec_bitmap } */ + /* Reserved type */ + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, + /* ZN Core (HWID=0xB0) MCA types */ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, @@ -427,35 +431,58 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static u32 smca_get_block_address(unsigned int cpu, unsigned int bank, + unsigned int block) +{ + u32 low, high; + u32 addr = 0; + + if (smca_get_bank_type(bank) == SMCA_RESERVED) + return addr; + + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + /* + * For SMCA enabled processors, BLKPTR field of the first MISC register + * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). + */ + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return addr; + + if (!(low & MCI_CONFIG_MCAX)) + return addr; + + if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + (low & MASK_BLKPTR_LO)) + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + + return addr; +} + static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block) { u32 addr = 0, offset = 0; - if (mce_flags.smca) { - if (!block) { - addr = MSR_AMD64_SMCA_MCx_MISC(bank); - } else { - /* - * For SMCA enabled processors, BLKPTR field of the - * first MISC register (MCx_MISC0) indicates presence of - * additional MISC register set (MISC1-4). - */ - u32 low, high; + if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) + return addr; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return addr; + /* Get address from already initialized block. */ + if (per_cpu(threshold_banks, cpu)) { + struct threshold_bank *bankp = per_cpu(threshold_banks, cpu)[bank]; - if (!(low & MCI_CONFIG_MCAX)) - return addr; + if (bankp && bankp->blocks) { + struct threshold_block *blockp = &bankp->blocks[block]; - if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && - (low & MASK_BLKPTR_LO)) - addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + if (blockp) + return blockp->address; } - return addr; } + if (mce_flags.smca) + return smca_get_block_address(cpu, bank, block); + /* Fall back to method we used for older processors: */ switch (block) { case 0: @@ -760,7 +787,7 @@ bool amd_mce_is_memory_error(struct mce *m) u8 xec = (m->status >> 16) & 0x1f; if (mce_flags.smca) - return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0; + return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0; return m->bank == 4 && xec == 0x8; } @@ -1063,7 +1090,7 @@ static struct kobj_type threshold_ktype = { static const char *get_name(unsigned int bank, struct threshold_block *b) { - unsigned int bank_type; + enum smca_bank_types bank_type; if (!mce_flags.smca) { if (b && bank == 4) @@ -1072,11 +1099,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - if (!smca_banks[bank].hwid) + bank_type = smca_get_bank_type(bank); + if (bank_type >= N_SMCA_BANK_TYPES) return NULL; - bank_type = smca_banks[bank].hwid->bank_type; - if (b && bank_type == SMCA_UMC) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) return smca_umc_block_names[b->block]; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index a11a671c7a38..2ab4d61ee47e 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -854,21 +854,24 @@ static void decode_mc6_mce(struct mce *m) static void decode_smca_error(struct mce *m) { struct smca_hwid *hwid; - unsigned int bank_type; + enum smca_bank_types bank_type; const char *ip_name; u8 xec = XEC(m->status, xec_mask); if (m->bank >= ARRAY_SIZE(smca_banks)) return; - if (x86_family(m->cpuid) >= 0x17 && m->bank == 4) - pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n"); - hwid = smca_banks[m->bank].hwid; if (!hwid) return; bank_type = hwid->bank_type; + + if (bank_type == SMCA_RESERVED) { + pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank); + return; + } + ip_name = smca_get_long_name(bank_type); pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec); |