diff options
author | Quan Nguyen <quan@os.amperecomputing.com> | 2022-10-31 09:44:41 +0700 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2022-11-10 19:02:43 +0100 |
commit | 4a4a4e9ebaa3ce903a3cdf8bb173eeaf87828cea (patch) | |
tree | a4f8d334be1e8ec88b8bb68d1e9b69af8114603f | |
parent | c08645ea215c446ceb21029fe5416e6a62cbbed7 (diff) |
misc: smpro-errmon: Add Ampere's SMpro error monitor driver
Add Ampere's SMpro error monitor driver for monitoring and reporting
RAS-related errors as reported by SMpro co-processor found on Ampere's
Altra processor family.
Signed-off-by: Quan Nguyen <quan@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20221031024442.2490881-3-quan@os.amperecomputing.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r-- | Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro | 264 | ||||
-rw-r--r-- | drivers/misc/Kconfig | 12 | ||||
-rw-r--r-- | drivers/misc/Makefile | 1 | ||||
-rw-r--r-- | drivers/misc/smpro-errmon.c | 529 |
4 files changed, 806 insertions, 0 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro b/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro new file mode 100644 index 000000000000..2b84dc8c3149 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro @@ -0,0 +1,264 @@ +What: /sys/bus/platform/devices/smpro-errmon.*/error_[core|mem|pcie|other]_[ce|ue] +KernelVersion: 6.1 +Contact: Quan Nguyen <quan@os.amperecomputing.com> +Description: + (RO) Contains the 48-byte Ampere (Vendor-Specific) Error Record printed + in hex format according to the table below: + + +--------+---------------+-------------+------------------------------------------------------------+ + | Offset | Field | Size (byte) | Description | + +--------+---------------+-------------+------------------------------------------------------------+ + | 00 | Error Type | 1 | See :ref:`the table below <smpro-error-types>` for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 01 | Subtype | 1 | See :ref:`the table below <smpro-error-types>` for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 02 | Instance | 2 | See :ref:`the table below <smpro-error-types>` for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 04 | Error status | 4 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 08 | Error Address | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 16 | Error Misc 0 | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 24 | Error Misc 1 | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 32 | Error Misc 2 | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 40 | Error Misc 3 | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + + The table below defines the value of error types, their subtype, subcomponent and instance: + + .. _smpro-error-types: + + +-----------------+------------+----------+----------------+----------------------------------------+ + | Error Group | Error Type | Sub type | Sub component | Instance | + +-----------------+------------+----------+----------------+----------------------------------------+ + | CPM (core) | 0 | 0 | Snoop-Logic | CPM # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | CPM (core) | 0 | 2 | Armv8 Core 1 | CPM # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 1 | ERR1 | MCU # \| SLOT << 11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 2 | ERR2 | MCU # \| SLOT << 11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 3 | ERR3 | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 4 | ERR4 | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 5 | ERR5 | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 6 | ERR6 | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 7 | Link Error | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | Mesh (other) | 2 | 0 | Cross Point | X \| (Y << 5) \| NS <<11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | Mesh (other) | 2 | 1 | Home Node(IO) | X \| (Y << 5) \| NS <<11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | Mesh (other) | 2 | 2 | Home Node(Mem) | X \| (Y << 5) \| NS <<11 \| device<<12 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | Mesh (other) | 2 | 4 | CCIX Node | X \| (Y << 5) \| NS <<11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | 2P Link (other) | 3 | 0 | N/A | Altra 2P Link # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 0 | ERR0 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 1 | ERR1 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 2 | ERR2 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 3 | ERR3 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 4 | ERR4 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 5 | ERR5 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 6 | ERR6 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 7 | ERR7 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 8 | ERR8 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 9 | ERR9 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 10 | ERR10 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 11 | ERR11 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 12 | ERR12 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 13-21 | ERR13 | RC # + 1 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TCU | 100 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU0 | 0 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU1 | 1 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU2 | 2 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU3 | 3 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU4 | 4 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU5 | 5 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU6 | 6 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU7 | 7 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU8 | 8 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU9 | 9 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe AER (pcie) | 7 | Root | 0 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe AER (pcie) | 7 | Device | 1 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe RC (pcie) | 8 | RCA HB | 0 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe RC (pcie) | 8 | RCB HB | 1 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe RC (pcie) | 8 | RASDP | 8 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | OCM (other) | 9 | ERR0 | 0 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | OCM (other) | 9 | ERR1 | 1 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | OCM (other) | 9 | ERR2 | 2 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMpro (other) | 10 | ERR0 | 0 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMpro (other) | 10 | ERR1 | 1 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMpro (other) | 10 | MPA_ERR | 2 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PMpro (other) | 11 | ERR0 | 0 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PMpro (other) | 11 | ERR1 | 1 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PMpro (other) | 11 | MPA_ERR | 2 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + + Example:: + + # cat error_other_ue + 880807001e004010401040101500000001004010401040100c0000000000000000000000000000000000000000000000 + + The detail of each sysfs entries is as below: + + +-------------+---------------------------------------------------------+----------------------------------+ + | Error | Sysfs entry | Description (when triggered) | + +-------------+---------------------------------------------------------+----------------------------------+ + | Core's CE | /sys/bus/platform/devices/smpro-errmon.*/error_core_ce | Core has CE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Core's UE | /sys/bus/platform/devices/smpro-errmon.*/error_core_ue | Core has UE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ce | Memory has CE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ue | Memory has UE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | PCIe's CE | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ce | any PCIe controller has CE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | PCIe's UE | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ue | any PCIe controller has UE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Other's CE | /sys/bus/platform/devices/smpro-errmon.*/error_other_ce | any other CE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Other's UE | /sys/bus/platform/devices/smpro-errmon.*/error_other_ue | any other UE error | + +-------------+---------------------------------------------------------+----------------------------------+ + + UE: Uncorrect-able Error + CE: Correct-able Error + + For details, see section `3.3 Ampere (Vendor-Specific) Error Record Formats, + Altra Family RAS Supplement`. + + +What: /sys/bus/platform/devices/smpro-errmon.*/overflow_[core|mem|pcie|other]_[ce|ue] +KernelVersion: 6.1 +Contact: Quan Nguyen <quan@os.amperecomputing.com> +Description: + (RO) Return the overflow status of each type HW error reported: + + - 0 : No overflow + - 1 : There is an overflow and the oldest HW errors are dropped + + The detail of each sysfs entries is as below: + + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Overflow | Sysfs entry | Description | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Core's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ce | Core CE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Core's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ue | Core UE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ce | Memory CE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ue | Memory UE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | PCIe's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ce | any PCIe controller CE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | PCIe's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ue | any PCIe controller UE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Other's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ce| any other CE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Other's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ue| other UE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + + where: + + - UE: Uncorrect-able Error + - CE: Correct-able Error + +What: /sys/bus/platform/devices/smpro-errmon.*/[error|warn]_[smpro|pmpro] +KernelVersion: 6.1 +Contact: Quan Nguyen <quan@os.amperecomputing.com> +Description: + (RO) Contains the internal firmware error/warning printed as hex format. + + The detail of each sysfs entries is as below: + + +---------------+------------------------------------------------------+--------------------------+ + | Error | Sysfs entry | Description | + +---------------+------------------------------------------------------+--------------------------+ + | SMpro error | /sys/bus/platform/devices/smpro-errmon.*/error_smpro | system has SMpro error | + +---------------+------------------------------------------------------+--------------------------+ + | SMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_smpro | system has SMpro warning | + +---------------+------------------------------------------------------+--------------------------+ + | PMpro error | /sys/bus/platform/devices/smpro-errmon.*/error_pmpro | system has PMpro error | + +---------------+------------------------------------------------------+--------------------------+ + | PMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_pmpro | system has PMpro warning | + +---------------+------------------------------------------------------+--------------------------+ + + For details, see section `5.10 RAS Internal Error Register Definitions, + Altra Family Soc BMC Interface Specification`. + +What: /sys/bus/platform/devices/smpro-errmon.*/event_[vrd_warn_fault|vrd_hot|dimm_hot] +KernelVersion: 6.1 +Contact: Quan Nguyen <quan@os.amperecomputing.com> +Description: + (RO) Contains the detail information in case of VRD/DIMM warning/hot events + in hex format as below:: + + AAAA + + where: + + - ``AAAA``: The event detail information data + + The detail of each sysfs entries is as below: + + +---------------+---------------------------------------------------------------+---------------------+ + | Event | Sysfs entry | Description | + +---------------+---------------------------------------------------------------+---------------------+ + | VRD HOT | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_hot | VRD Hot | + +---------------+---------------------------------------------------------------+---------------------+ + | VR Warn/Fault | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_warn_fault | VR Warning or Fault | + +---------------+---------------------------------------------------------------+---------------------+ + | DIMM HOT | /sys/bus/platform/devices/smpro-errmon.*/event_dimm_hot | DIMM Hot | + +---------------+---------------------------------------------------------------+---------------------+ + + For more details, see section `5.7 GPI Status Registers, + Altra Family Soc BMC Interface Specification`. + diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 358ad56f6524..b9ceee949dab 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -176,6 +176,18 @@ config SGI_XP this feature will allow for direct communication between SSIs based on a network adapter and DMA messaging. +config SMPRO_ERRMON + tristate "Ampere Computing SMPro error monitor driver" + depends on MFD_SMPRO || COMPILE_TEST + help + Say Y here to get support for the SMpro error monitor function + provided by Ampere Computing's Altra and Altra Max SoCs. Upon + loading, the driver creates sysfs files which can be use to gather + multiple HW error data reported via read and write system calls. + + To compile this driver as a module, say M here. The driver will be + called smpro-errmon. + config CS5535_MFGPT tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support" depends on MFD_CS5535 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index ac9b3e757ba1..bbe24d4511a3 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o obj-$(CONFIG_KGDB_TESTS) += kgdbts.o obj-$(CONFIG_SGI_XP) += sgi-xp/ obj-$(CONFIG_SGI_GRU) += sgi-gru/ +obj-$(CONFIG_SMPRO_ERRMON) += smpro-errmon.o obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o obj-$(CONFIG_GEHC_ACHC) += gehc-achc.o obj-$(CONFIG_HP_ILO) += hpilo.o diff --git a/drivers/misc/smpro-errmon.c b/drivers/misc/smpro-errmon.c new file mode 100644 index 000000000000..d1431d419aa4 --- /dev/null +++ b/drivers/misc/smpro-errmon.c @@ -0,0 +1,529 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ampere Computing SoC's SMpro Error Monitoring Driver + * + * Copyright (c) 2022, Ampere Computing LLC + * + */ + +#include <linux/i2c.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/regmap.h> + +/* GPI RAS Error Registers */ +#define GPI_RAS_ERR 0x7E + +/* Core and L2C Error Registers */ +#define CORE_CE_ERR_CNT 0x80 +#define CORE_CE_ERR_LEN 0x81 +#define CORE_CE_ERR_DATA 0x82 +#define CORE_UE_ERR_CNT 0x83 +#define CORE_UE_ERR_LEN 0x84 +#define CORE_UE_ERR_DATA 0x85 + +/* Memory Error Registers */ +#define MEM_CE_ERR_CNT 0x90 +#define MEM_CE_ERR_LEN 0x91 +#define MEM_CE_ERR_DATA 0x92 +#define MEM_UE_ERR_CNT 0x93 +#define MEM_UE_ERR_LEN 0x94 +#define MEM_UE_ERR_DATA 0x95 + +/* RAS Error/Warning Registers */ +#define ERR_SMPRO_TYPE 0xA0 +#define ERR_PMPRO_TYPE 0xA1 +#define ERR_SMPRO_INFO_LO 0xA2 +#define ERR_SMPRO_INFO_HI 0xA3 +#define ERR_SMPRO_DATA_LO 0xA4 +#define ERR_SMPRO_DATA_HI 0xA5 +#define WARN_SMPRO_INFO_LO 0xAA +#define WARN_SMPRO_INFO_HI 0xAB +#define ERR_PMPRO_INFO_LO 0xA6 +#define ERR_PMPRO_INFO_HI 0xA7 +#define ERR_PMPRO_DATA_LO 0xA8 +#define ERR_PMPRO_DATA_HI 0xA9 +#define WARN_PMPRO_INFO_LO 0xAC +#define WARN_PMPRO_INFO_HI 0xAD + +/* PCIE Error Registers */ +#define PCIE_CE_ERR_CNT 0xC0 +#define PCIE_CE_ERR_LEN 0xC1 +#define PCIE_CE_ERR_DATA 0xC2 +#define PCIE_UE_ERR_CNT 0xC3 +#define PCIE_UE_ERR_LEN 0xC4 +#define PCIE_UE_ERR_DATA 0xC5 + +/* Other Error Registers */ +#define OTHER_CE_ERR_CNT 0xD0 +#define OTHER_CE_ERR_LEN 0xD1 +#define OTHER_CE_ERR_DATA 0xD2 +#define OTHER_UE_ERR_CNT 0xD8 +#define OTHER_UE_ERR_LEN 0xD9 +#define OTHER_UE_ERR_DATA 0xDA + +/* Event Data Registers */ +#define VRD_WARN_FAULT_EVENT_DATA 0x78 +#define VRD_HOT_EVENT_DATA 0x79 +#define DIMM_HOT_EVENT_DATA 0x7A + +#define MAX_READ_BLOCK_LENGTH 48 + +#define RAS_SMPRO_ERR 0 +#define RAS_PMPRO_ERR 1 + +enum RAS_48BYTES_ERR_TYPES { + CORE_CE_ERR, + CORE_UE_ERR, + MEM_CE_ERR, + MEM_UE_ERR, + PCIE_CE_ERR, + PCIE_UE_ERR, + OTHER_CE_ERR, + OTHER_UE_ERR, + NUM_48BYTES_ERR_TYPE, +}; + +struct smpro_error_hdr { + u8 count; /* Number of the RAS errors */ + u8 len; /* Number of data bytes */ + u8 data; /* Start of 48-byte data */ + u8 max_cnt; /* Max num of errors */ +}; + +/* + * Included Address of registers to get Count, Length of data and Data + * of the 48 bytes error data + */ +static struct smpro_error_hdr smpro_error_table[] = { + [CORE_CE_ERR] = { + .count = CORE_CE_ERR_CNT, + .len = CORE_CE_ERR_LEN, + .data = CORE_CE_ERR_DATA, + .max_cnt = 32 + }, + [CORE_UE_ERR] = { + .count = CORE_UE_ERR_CNT, + .len = CORE_UE_ERR_LEN, + .data = CORE_UE_ERR_DATA, + .max_cnt = 32 + }, + [MEM_CE_ERR] = { + .count = MEM_CE_ERR_CNT, + .len = MEM_CE_ERR_LEN, + .data = MEM_CE_ERR_DATA, + .max_cnt = 16 + }, + [MEM_UE_ERR] = { + .count = MEM_UE_ERR_CNT, + .len = MEM_UE_ERR_LEN, + .data = MEM_UE_ERR_DATA, + .max_cnt = 16 + }, + [PCIE_CE_ERR] = { + .count = PCIE_CE_ERR_CNT, + .len = PCIE_CE_ERR_LEN, + .data = PCIE_CE_ERR_DATA, + .max_cnt = 96 + }, + [PCIE_UE_ERR] = { + .count = PCIE_UE_ERR_CNT, + .len = PCIE_UE_ERR_LEN, + .data = PCIE_UE_ERR_DATA, + .max_cnt = 96 + }, + [OTHER_CE_ERR] = { + .count = OTHER_CE_ERR_CNT, + .len = OTHER_CE_ERR_LEN, + .data = OTHER_CE_ERR_DATA, + .max_cnt = 8 + }, + [OTHER_UE_ERR] = { + .count = OTHER_UE_ERR_CNT, + .len = OTHER_UE_ERR_LEN, + .data = OTHER_UE_ERR_DATA, + .max_cnt = 8 + }, +}; + +/* + * List of SCP registers which are used to get + * one type of RAS Internal errors. + */ +struct smpro_int_error_hdr { + u8 type; + u8 info_l; + u8 info_h; + u8 data_l; + u8 data_h; + u8 warn_l; + u8 warn_h; +}; + +static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = { + [RAS_SMPRO_ERR] = { + .type = ERR_SMPRO_TYPE, + .info_l = ERR_SMPRO_INFO_LO, + .info_h = ERR_SMPRO_INFO_HI, + .data_l = ERR_SMPRO_DATA_LO, + .data_h = ERR_SMPRO_DATA_HI, + .warn_l = WARN_SMPRO_INFO_LO, + .warn_h = WARN_SMPRO_INFO_HI, + }, + [RAS_PMPRO_ERR] = { + .type = ERR_PMPRO_TYPE, + .info_l = ERR_PMPRO_INFO_LO, + .info_h = ERR_PMPRO_INFO_HI, + .data_l = ERR_PMPRO_DATA_LO, + .data_h = ERR_PMPRO_DATA_HI, + .warn_l = WARN_PMPRO_INFO_LO, + .warn_h = WARN_PMPRO_INFO_HI, + }, +}; + +struct smpro_errmon { + struct regmap *regmap; +}; + +enum EVENT_TYPES { + VRD_WARN_FAULT_EVENT, + VRD_HOT_EVENT, + DIMM_HOT_EVENT, + NUM_EVENTS_TYPE, +}; + +/* Included Address of event source and data registers */ +static u8 smpro_event_table[NUM_EVENTS_TYPE] = { + VRD_WARN_FAULT_EVENT_DATA, + VRD_HOT_EVENT_DATA, + DIMM_HOT_EVENT_DATA, +}; + +static ssize_t smpro_event_data_read(struct device *dev, + struct device_attribute *da, char *buf, + int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + s32 event_data; + int ret; + + ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data); + if (ret) + return ret; + /* Clear event after read */ + if (event_data != 0) + regmap_write(errmon->regmap, smpro_event_table[channel], event_data); + + return sysfs_emit(buf, "%04x\n", event_data); +} + +static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + struct smpro_error_hdr *err_info; + s32 err_count; + int ret; + + err_info = &smpro_error_table[channel]; + + ret = regmap_read(errmon->regmap, err_info->count, &err_count); + if (ret) + return ret; + + /* Bit 8 indicates the overflow status */ + return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0); +} + +static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + unsigned char err_data[MAX_READ_BLOCK_LENGTH]; + struct smpro_error_hdr *err_info; + s32 err_count, err_length; + int ret; + + err_info = &smpro_error_table[channel]; + + ret = regmap_read(errmon->regmap, err_info->count, &err_count); + /* Error count is the low byte */ + err_count &= 0xff; + if (ret || !err_count || err_count > err_info->max_cnt) + return ret; + + ret = regmap_read(errmon->regmap, err_info->len, &err_length); + if (ret || err_length <= 0) + return ret; + + if (err_length > MAX_READ_BLOCK_LENGTH) + err_length = MAX_READ_BLOCK_LENGTH; + + memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH); + ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length); + if (ret < 0) + return ret; + + /* clear the error */ + ret = regmap_write(errmon->regmap, err_info->count, 0x100); + if (ret) + return ret; + /* + * The output of Core/Memory/PCIe/Others UE/CE errors follows the format + * specified in section 5.8.1 CE/UE Error Data record in + * Altra SOC BMC Interface specification. + */ + return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data); +} + +/* + * Output format: + * <4-byte hex value of error info><4-byte hex value of error extensive data> + * Where: + * + error info : The error information + * + error data : Extensive data (32 bits) + * Reference to section 5.10 RAS Internal Error Register Definition in + * Altra SOC BMC Interface specification + */ +static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + struct smpro_int_error_hdr *err_info; + unsigned int err[4] = { 0 }; + unsigned int err_type; + unsigned int val; + int ret; + + /* read error status */ + ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); + if (ret) + return ret; + + if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || + (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) + return 0; + + err_info = &list_smpro_int_error_hdr[channel]; + ret = regmap_read(errmon->regmap, err_info->type, &val); + if (ret) + return ret; + + err_type = (val & BIT(1)) ? BIT(1) : + (val & BIT(2)) ? BIT(2) : 0; + + if (!err_type) + return 0; + + ret = regmap_read(errmon->regmap, err_info->info_l, err + 1); + if (ret) + return ret; + + ret = regmap_read(errmon->regmap, err_info->info_h, err); + if (ret) + return ret; + + if (err_type & BIT(2)) { + /* Error with data type */ + ret = regmap_read(errmon->regmap, err_info->data_l, err + 3); + if (ret) + return ret; + + ret = regmap_read(errmon->regmap, err_info->data_h, err + 2); + if (ret) + return ret; + } + + /* clear the read errors */ + ret = regmap_write(errmon->regmap, err_info->type, err_type); + if (ret) + return ret; + + return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err); +} + +/* + * Output format: + * <4-byte hex value of warining info> + * Reference to section 5.10 RAS Internal Error Register Definition in + * Altra SOC BMC Interface specification + */ +static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + struct smpro_int_error_hdr *err_info; + unsigned int warn[2] = { 0 }; + unsigned int val; + int ret; + + /* read error status */ + ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); + if (ret) + return ret; + + if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || + (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) + return 0; + + err_info = &list_smpro_int_error_hdr[channel]; + ret = regmap_read(errmon->regmap, err_info->type, &val); + if (ret) + return ret; + + if (!(val & BIT(0))) + return 0; + + ret = regmap_read(errmon->regmap, err_info->warn_l, warn + 1); + if (ret) + return ret; + + ret = regmap_read(errmon->regmap, err_info->warn_h, warn); + if (ret) + return ret; + + /* clear the warning */ + ret = regmap_write(errmon->regmap, err_info->type, BIT(0)); + if (ret) + return ret; + + return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn); +} + +#define ERROR_OVERFLOW_RO(_error, _index) \ + static ssize_t overflow_##_error##_show(struct device *dev, \ + struct device_attribute *da, \ + char *buf) \ + { \ + return smpro_overflow_data_read(dev, da, buf, _index); \ + } \ + static DEVICE_ATTR_RO(overflow_##_error) + +ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR); +ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR); +ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR); +ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR); +ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR); +ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR); +ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR); +ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR); + +#define ERROR_RO(_error, _index) \ + static ssize_t error_##_error##_show(struct device *dev, \ + struct device_attribute *da, \ + char *buf) \ + { \ + return smpro_error_data_read(dev, da, buf, _index); \ + } \ + static DEVICE_ATTR_RO(error_##_error) + +ERROR_RO(core_ce, CORE_CE_ERR); +ERROR_RO(core_ue, CORE_UE_ERR); +ERROR_RO(mem_ce, MEM_CE_ERR); +ERROR_RO(mem_ue, MEM_UE_ERR); +ERROR_RO(pcie_ce, PCIE_CE_ERR); +ERROR_RO(pcie_ue, PCIE_UE_ERR); +ERROR_RO(other_ce, OTHER_CE_ERR); +ERROR_RO(other_ue, OTHER_UE_ERR); + +static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR); +} +static DEVICE_ATTR_RO(error_smpro); + +static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR); +} +static DEVICE_ATTR_RO(error_pmpro); + +static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR); +} +static DEVICE_ATTR_RO(warn_smpro); + +static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR); +} +static DEVICE_ATTR_RO(warn_pmpro); + +#define EVENT_RO(_event, _index) \ + static ssize_t event_##_event##_show(struct device *dev, \ + struct device_attribute *da, \ + char *buf) \ + { \ + return smpro_event_data_read(dev, da, buf, _index); \ + } \ + static DEVICE_ATTR_RO(event_##_event) + +EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT); +EVENT_RO(vrd_hot, VRD_HOT_EVENT); +EVENT_RO(dimm_hot, DIMM_HOT_EVENT); + +static struct attribute *smpro_errmon_attrs[] = { + &dev_attr_overflow_core_ce.attr, + &dev_attr_overflow_core_ue.attr, + &dev_attr_overflow_mem_ce.attr, + &dev_attr_overflow_mem_ue.attr, + &dev_attr_overflow_pcie_ce.attr, + &dev_attr_overflow_pcie_ue.attr, + &dev_attr_overflow_other_ce.attr, + &dev_attr_overflow_other_ue.attr, + &dev_attr_error_core_ce.attr, + &dev_attr_error_core_ue.attr, + &dev_attr_error_mem_ce.attr, + &dev_attr_error_mem_ue.attr, + &dev_attr_error_pcie_ce.attr, + &dev_attr_error_pcie_ue.attr, + &dev_attr_error_other_ce.attr, + &dev_attr_error_other_ue.attr, + &dev_attr_error_smpro.attr, + &dev_attr_error_pmpro.attr, + &dev_attr_warn_smpro.attr, + &dev_attr_warn_pmpro.attr, + &dev_attr_event_vrd_warn_fault.attr, + &dev_attr_event_vrd_hot.attr, + &dev_attr_event_dimm_hot.attr, + NULL +}; + +ATTRIBUTE_GROUPS(smpro_errmon); + +static int smpro_errmon_probe(struct platform_device *pdev) +{ + struct smpro_errmon *errmon; + + errmon = devm_kzalloc(&pdev->dev, sizeof(struct smpro_errmon), GFP_KERNEL); + if (!errmon) + return -ENOMEM; + + platform_set_drvdata(pdev, errmon); + + errmon->regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!errmon->regmap) + return -ENODEV; + + return 0; +} + +static struct platform_driver smpro_errmon_driver = { + .probe = smpro_errmon_probe, + .driver = { + .name = "smpro-errmon", + .dev_groups = smpro_errmon_groups, + }, +}; + +module_platform_driver(smpro_errmon_driver); + +MODULE_AUTHOR("Tung Nguyen <tung.nguyen@amperecomputing.com>"); +MODULE_AUTHOR("Thinh Pham <thinh.pham@amperecomputing.com>"); +MODULE_AUTHOR("Hoang Nguyen <hnguyen@amperecomputing.com>"); +MODULE_AUTHOR("Thu Nguyen <thu@os.amperecomputing.com>"); +MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>"); +MODULE_DESCRIPTION("Ampere Altra SMpro driver"); +MODULE_LICENSE("GPL"); |