diff options
Diffstat (limited to 'drivers/misc/smpro-errmon.c')
| -rw-r--r-- | drivers/misc/smpro-errmon.c | 529 | 
1 files changed, 529 insertions, 0 deletions
diff --git a/drivers/misc/smpro-errmon.c b/drivers/misc/smpro-errmon.c new file mode 100644 index 000000000000..d1431d419aa4 --- /dev/null +++ b/drivers/misc/smpro-errmon.c @@ -0,0 +1,529 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ampere Computing SoC's SMpro Error Monitoring Driver + * + * Copyright (c) 2022, Ampere Computing LLC + * + */ + +#include <linux/i2c.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/regmap.h> + +/* GPI RAS Error Registers */ +#define GPI_RAS_ERR		0x7E + +/* Core and L2C Error Registers */ +#define CORE_CE_ERR_CNT		0x80 +#define CORE_CE_ERR_LEN		0x81 +#define CORE_CE_ERR_DATA	0x82 +#define CORE_UE_ERR_CNT		0x83 +#define CORE_UE_ERR_LEN		0x84 +#define CORE_UE_ERR_DATA	0x85 + +/* Memory Error Registers */ +#define MEM_CE_ERR_CNT		0x90 +#define MEM_CE_ERR_LEN		0x91 +#define MEM_CE_ERR_DATA		0x92 +#define MEM_UE_ERR_CNT		0x93 +#define MEM_UE_ERR_LEN		0x94 +#define MEM_UE_ERR_DATA		0x95 + +/* RAS Error/Warning Registers */ +#define ERR_SMPRO_TYPE		0xA0 +#define ERR_PMPRO_TYPE		0xA1 +#define ERR_SMPRO_INFO_LO	0xA2 +#define ERR_SMPRO_INFO_HI	0xA3 +#define ERR_SMPRO_DATA_LO	0xA4 +#define ERR_SMPRO_DATA_HI	0xA5 +#define WARN_SMPRO_INFO_LO	0xAA +#define WARN_SMPRO_INFO_HI	0xAB +#define ERR_PMPRO_INFO_LO	0xA6 +#define ERR_PMPRO_INFO_HI	0xA7 +#define ERR_PMPRO_DATA_LO	0xA8 +#define ERR_PMPRO_DATA_HI	0xA9 +#define WARN_PMPRO_INFO_LO	0xAC +#define WARN_PMPRO_INFO_HI	0xAD + +/* PCIE Error Registers */ +#define PCIE_CE_ERR_CNT		0xC0 +#define PCIE_CE_ERR_LEN		0xC1 +#define PCIE_CE_ERR_DATA	0xC2 +#define PCIE_UE_ERR_CNT		0xC3 +#define PCIE_UE_ERR_LEN		0xC4 +#define PCIE_UE_ERR_DATA	0xC5 + +/* Other Error Registers */ +#define OTHER_CE_ERR_CNT	0xD0 +#define OTHER_CE_ERR_LEN	0xD1 +#define OTHER_CE_ERR_DATA	0xD2 +#define OTHER_UE_ERR_CNT	0xD8 +#define OTHER_UE_ERR_LEN	0xD9 +#define OTHER_UE_ERR_DATA	0xDA + +/* Event Data Registers */ +#define VRD_WARN_FAULT_EVENT_DATA	0x78 +#define VRD_HOT_EVENT_DATA		0x79 +#define DIMM_HOT_EVENT_DATA		0x7A + +#define MAX_READ_BLOCK_LENGTH	48 + +#define RAS_SMPRO_ERR		0 +#define RAS_PMPRO_ERR		1 + +enum RAS_48BYTES_ERR_TYPES { +	CORE_CE_ERR, +	CORE_UE_ERR, +	MEM_CE_ERR, +	MEM_UE_ERR, +	PCIE_CE_ERR, +	PCIE_UE_ERR, +	OTHER_CE_ERR, +	OTHER_UE_ERR, +	NUM_48BYTES_ERR_TYPE, +}; + +struct smpro_error_hdr { +	u8 count;	/* Number of the RAS errors */ +	u8 len;		/* Number of data bytes */ +	u8 data;	/* Start of 48-byte data */ +	u8 max_cnt;	/* Max num of errors */ +}; + +/* + * Included Address of registers to get Count, Length of data and Data + * of the 48 bytes error data + */ +static struct smpro_error_hdr smpro_error_table[] = { +	[CORE_CE_ERR] = { +		.count = CORE_CE_ERR_CNT, +		.len = CORE_CE_ERR_LEN, +		.data = CORE_CE_ERR_DATA, +		.max_cnt = 32 +	}, +	[CORE_UE_ERR] = { +		.count = CORE_UE_ERR_CNT, +		.len = CORE_UE_ERR_LEN, +		.data = CORE_UE_ERR_DATA, +		.max_cnt = 32 +	}, +	[MEM_CE_ERR] = { +		.count = MEM_CE_ERR_CNT, +		.len = MEM_CE_ERR_LEN, +		.data = MEM_CE_ERR_DATA, +		.max_cnt = 16 +	}, +	[MEM_UE_ERR] = { +		.count = MEM_UE_ERR_CNT, +		.len = MEM_UE_ERR_LEN, +		.data = MEM_UE_ERR_DATA, +		.max_cnt = 16 +	}, +	[PCIE_CE_ERR] = { +		.count = PCIE_CE_ERR_CNT, +		.len = PCIE_CE_ERR_LEN, +		.data = PCIE_CE_ERR_DATA, +		.max_cnt = 96 +	}, +	[PCIE_UE_ERR] = { +		.count = PCIE_UE_ERR_CNT, +		.len = PCIE_UE_ERR_LEN, +		.data = PCIE_UE_ERR_DATA, +		.max_cnt = 96 +	}, +	[OTHER_CE_ERR] = { +		.count = OTHER_CE_ERR_CNT, +		.len = OTHER_CE_ERR_LEN, +		.data = OTHER_CE_ERR_DATA, +		.max_cnt = 8 +	}, +	[OTHER_UE_ERR] = { +		.count = OTHER_UE_ERR_CNT, +		.len = OTHER_UE_ERR_LEN, +		.data = OTHER_UE_ERR_DATA, +		.max_cnt = 8 +	}, +}; + +/* + * List of SCP registers which are used to get + * one type of RAS Internal errors. + */ +struct smpro_int_error_hdr { +	u8 type; +	u8 info_l; +	u8 info_h; +	u8 data_l; +	u8 data_h; +	u8 warn_l; +	u8 warn_h; +}; + +static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = { +	[RAS_SMPRO_ERR] = { +		.type = ERR_SMPRO_TYPE, +		.info_l = ERR_SMPRO_INFO_LO, +		.info_h = ERR_SMPRO_INFO_HI, +		.data_l = ERR_SMPRO_DATA_LO, +		.data_h = ERR_SMPRO_DATA_HI, +		.warn_l = WARN_SMPRO_INFO_LO, +		.warn_h = WARN_SMPRO_INFO_HI, +	}, +	[RAS_PMPRO_ERR] = { +		.type = ERR_PMPRO_TYPE, +		.info_l = ERR_PMPRO_INFO_LO, +		.info_h = ERR_PMPRO_INFO_HI, +		.data_l = ERR_PMPRO_DATA_LO, +		.data_h = ERR_PMPRO_DATA_HI, +		.warn_l = WARN_PMPRO_INFO_LO, +		.warn_h = WARN_PMPRO_INFO_HI, +	}, +}; + +struct smpro_errmon { +	struct regmap *regmap; +}; + +enum EVENT_TYPES { +	VRD_WARN_FAULT_EVENT, +	VRD_HOT_EVENT, +	DIMM_HOT_EVENT, +	NUM_EVENTS_TYPE, +}; + +/* Included Address of event source and data registers */ +static u8 smpro_event_table[NUM_EVENTS_TYPE] = { +	VRD_WARN_FAULT_EVENT_DATA, +	VRD_HOT_EVENT_DATA, +	DIMM_HOT_EVENT_DATA, +}; + +static ssize_t smpro_event_data_read(struct device *dev, +				     struct device_attribute *da, char *buf, +				     int channel) +{ +	struct smpro_errmon *errmon = dev_get_drvdata(dev); +	s32 event_data; +	int ret; + +	ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data); +	if (ret) +		return ret; +	/* Clear event after read */ +	if (event_data != 0) +		regmap_write(errmon->regmap, smpro_event_table[channel], event_data); + +	return sysfs_emit(buf, "%04x\n", event_data); +} + +static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da, +					char *buf, int channel) +{ +	struct smpro_errmon *errmon = dev_get_drvdata(dev); +	struct smpro_error_hdr *err_info; +	s32 err_count; +	int ret; + +	err_info = &smpro_error_table[channel]; + +	ret = regmap_read(errmon->regmap, err_info->count, &err_count); +	if (ret) +		return ret; + +	/* Bit 8 indicates the overflow status */ +	return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0); +} + +static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da, +				     char *buf, int channel) +{ +	struct smpro_errmon *errmon = dev_get_drvdata(dev); +	unsigned char err_data[MAX_READ_BLOCK_LENGTH]; +	struct smpro_error_hdr *err_info; +	s32 err_count, err_length; +	int ret; + +	err_info = &smpro_error_table[channel]; + +	ret = regmap_read(errmon->regmap, err_info->count, &err_count); +	/* Error count is the low byte */ +	err_count &= 0xff; +	if (ret || !err_count || err_count > err_info->max_cnt) +		return ret; + +	ret = regmap_read(errmon->regmap, err_info->len, &err_length); +	if (ret || err_length <= 0) +		return ret; + +	if (err_length > MAX_READ_BLOCK_LENGTH) +		err_length = MAX_READ_BLOCK_LENGTH; + +	memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH); +	ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length); +	if (ret < 0) +		return ret; + +	/* clear the error */ +	ret = regmap_write(errmon->regmap, err_info->count, 0x100); +	if (ret) +		return ret; +	/* +	 * The output of Core/Memory/PCIe/Others UE/CE errors follows the format +	 * specified in section 5.8.1 CE/UE Error Data record in +	 * Altra SOC BMC Interface specification. +	 */ +	return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data); +} + +/* + * Output format: + * <4-byte hex value of error info><4-byte hex value of error extensive data> + * Where: + *   + error info : The error information + *   + error data : Extensive data (32 bits) + * Reference to section 5.10 RAS Internal Error Register Definition in + * Altra SOC BMC Interface specification + */ +static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da, +				       char *buf, int channel) +{ +	struct smpro_errmon *errmon = dev_get_drvdata(dev); +	struct smpro_int_error_hdr *err_info; +	unsigned int err[4] = { 0 }; +	unsigned int err_type; +	unsigned int val; +	int ret; + +	/* read error status */ +	ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); +	if (ret) +		return ret; + +	if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || +	    (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) +		return 0; + +	err_info = &list_smpro_int_error_hdr[channel]; +	ret = regmap_read(errmon->regmap, err_info->type, &val); +	if (ret) +		return ret; + +	err_type = (val & BIT(1)) ? BIT(1) : +		   (val & BIT(2)) ? BIT(2) : 0; + +	if (!err_type) +		return 0; + +	ret = regmap_read(errmon->regmap, err_info->info_l, err + 1); +	if (ret) +		return ret; + +	ret = regmap_read(errmon->regmap, err_info->info_h, err); +	if (ret) +		return ret; + +	if (err_type & BIT(2)) { +		/* Error with data type */ +		ret = regmap_read(errmon->regmap, err_info->data_l, err + 3); +		if (ret) +			return ret; + +		ret = regmap_read(errmon->regmap, err_info->data_h, err + 2); +		if (ret) +			return ret; +	} + +	/* clear the read errors */ +	ret = regmap_write(errmon->regmap, err_info->type, err_type); +	if (ret) +		return ret; + +	return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err); +} + +/* + * Output format: + * <4-byte hex value of warining info> + * Reference to section 5.10 RAS Internal Error Register Definition in + * Altra SOC BMC Interface specification + */ +static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da, +					char *buf, int channel) +{ +	struct smpro_errmon *errmon = dev_get_drvdata(dev); +	struct smpro_int_error_hdr *err_info; +	unsigned int warn[2] = { 0 }; +	unsigned int val; +	int ret; + +	/* read error status */ +	ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); +	if (ret) +		return ret; + +	if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || +	    (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) +		return 0; + +	err_info = &list_smpro_int_error_hdr[channel]; +	ret = regmap_read(errmon->regmap, err_info->type, &val); +	if (ret) +		return ret; + +	if (!(val & BIT(0))) +		return 0; + +	ret = regmap_read(errmon->regmap, err_info->warn_l, warn + 1); +	if (ret) +		return ret; + +	ret = regmap_read(errmon->regmap, err_info->warn_h, warn); +	if (ret) +		return ret; + +	/* clear the warning */ +	ret = regmap_write(errmon->regmap, err_info->type, BIT(0)); +	if (ret) +		return ret; + +	return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn); +} + +#define ERROR_OVERFLOW_RO(_error, _index) \ +	static ssize_t overflow_##_error##_show(struct device *dev,            \ +						struct device_attribute *da,   \ +						char *buf)                     \ +	{                                                                      \ +		return smpro_overflow_data_read(dev, da, buf, _index);         \ +	}                                                                      \ +	static DEVICE_ATTR_RO(overflow_##_error) + +ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR); +ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR); +ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR); +ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR); +ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR); +ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR); +ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR); +ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR); + +#define ERROR_RO(_error, _index) \ +	static ssize_t error_##_error##_show(struct device *dev,            \ +					     struct device_attribute *da,   \ +					     char *buf)                     \ +	{                                                                   \ +		return smpro_error_data_read(dev, da, buf, _index);         \ +	}                                                                   \ +	static DEVICE_ATTR_RO(error_##_error) + +ERROR_RO(core_ce, CORE_CE_ERR); +ERROR_RO(core_ue, CORE_UE_ERR); +ERROR_RO(mem_ce, MEM_CE_ERR); +ERROR_RO(mem_ue, MEM_UE_ERR); +ERROR_RO(pcie_ce, PCIE_CE_ERR); +ERROR_RO(pcie_ue, PCIE_UE_ERR); +ERROR_RO(other_ce, OTHER_CE_ERR); +ERROR_RO(other_ue, OTHER_UE_ERR); + +static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ +	return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR); +} +static DEVICE_ATTR_RO(error_smpro); + +static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ +	return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR); +} +static DEVICE_ATTR_RO(error_pmpro); + +static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ +	return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR); +} +static DEVICE_ATTR_RO(warn_smpro); + +static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ +	return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR); +} +static DEVICE_ATTR_RO(warn_pmpro); + +#define EVENT_RO(_event, _index) \ +	static ssize_t event_##_event##_show(struct device *dev,            \ +					     struct device_attribute *da,   \ +					     char *buf)                     \ +	{                                                                   \ +		return smpro_event_data_read(dev, da, buf, _index);         \ +	}                                                                   \ +	static DEVICE_ATTR_RO(event_##_event) + +EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT); +EVENT_RO(vrd_hot, VRD_HOT_EVENT); +EVENT_RO(dimm_hot, DIMM_HOT_EVENT); + +static struct attribute *smpro_errmon_attrs[] = { +	&dev_attr_overflow_core_ce.attr, +	&dev_attr_overflow_core_ue.attr, +	&dev_attr_overflow_mem_ce.attr, +	&dev_attr_overflow_mem_ue.attr, +	&dev_attr_overflow_pcie_ce.attr, +	&dev_attr_overflow_pcie_ue.attr, +	&dev_attr_overflow_other_ce.attr, +	&dev_attr_overflow_other_ue.attr, +	&dev_attr_error_core_ce.attr, +	&dev_attr_error_core_ue.attr, +	&dev_attr_error_mem_ce.attr, +	&dev_attr_error_mem_ue.attr, +	&dev_attr_error_pcie_ce.attr, +	&dev_attr_error_pcie_ue.attr, +	&dev_attr_error_other_ce.attr, +	&dev_attr_error_other_ue.attr, +	&dev_attr_error_smpro.attr, +	&dev_attr_error_pmpro.attr, +	&dev_attr_warn_smpro.attr, +	&dev_attr_warn_pmpro.attr, +	&dev_attr_event_vrd_warn_fault.attr, +	&dev_attr_event_vrd_hot.attr, +	&dev_attr_event_dimm_hot.attr, +	NULL +}; + +ATTRIBUTE_GROUPS(smpro_errmon); + +static int smpro_errmon_probe(struct platform_device *pdev) +{ +	struct smpro_errmon *errmon; + +	errmon = devm_kzalloc(&pdev->dev, sizeof(struct smpro_errmon), GFP_KERNEL); +	if (!errmon) +		return -ENOMEM; + +	platform_set_drvdata(pdev, errmon); + +	errmon->regmap = dev_get_regmap(pdev->dev.parent, NULL); +	if (!errmon->regmap) +		return -ENODEV; + +	return 0; +} + +static struct platform_driver smpro_errmon_driver = { +	.probe          = smpro_errmon_probe, +	.driver = { +		.name   = "smpro-errmon", +		.dev_groups = smpro_errmon_groups, +	}, +}; + +module_platform_driver(smpro_errmon_driver); + +MODULE_AUTHOR("Tung Nguyen <tung.nguyen@amperecomputing.com>"); +MODULE_AUTHOR("Thinh Pham <thinh.pham@amperecomputing.com>"); +MODULE_AUTHOR("Hoang Nguyen <hnguyen@amperecomputing.com>"); +MODULE_AUTHOR("Thu Nguyen <thu@os.amperecomputing.com>"); +MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>"); +MODULE_DESCRIPTION("Ampere Altra SMpro driver"); +MODULE_LICENSE("GPL");  | 
