diff options
author | Yang Wang <kevinyang.wang@amd.com> | 2023-09-25 19:30:26 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2023-10-13 11:35:35 -0400 |
commit | 5b1270beb3801d328b43577a8bb1152d435bb146 (patch) | |
tree | d4e881f7a85912a98ce06a7d73e95b186b239358 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | |
parent | 6a1c31c7a88d8fd32e10a875b76922e5d175428f (diff) |
drm/amdgpu: add ras_err_info to identify RAS error source
introduced "ras_err_info" to better identify a RAS ERROR source.
NOTE:
For legacy chips, keep the original RAS error print format.
v1:
RAS errors may come from different dies during a RAS error query,
therefore, need a new data structure to identify the source of RAS ERROR.
v2:
- use new data structure 'amdgpu_smuio_mcm_config_info' instead of
ras_err_id (in v1 patch)
- refine ras error dump function name
- refine ras error dump log format
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 25 |
1 files changed, 25 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 46b0dcf846dc..0a5c8a107fb2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -28,6 +28,7 @@ #include <linux/list.h> #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" +#include "amdgpu_smuio.h" struct amdgpu_iv_entry; @@ -443,13 +444,29 @@ struct ras_fs_data { char debugfs_name[32]; }; +struct ras_err_info { + struct amdgpu_smuio_mcm_config_info mcm_info; + u64 ce_count; + u64 ue_count; +}; + +struct ras_err_node { + struct list_head node; + struct ras_err_info err_info; +}; + struct ras_err_data { unsigned long ue_count; unsigned long ce_count; unsigned long err_addr_cnt; struct eeprom_table_record *err_addr; + u32 err_list_count; + struct list_head err_node_list; }; +#define for_each_ras_error(err_node, err_data) \ + list_for_each_entry(err_node, &(err_data)->err_node_list, node) + struct ras_err_handler_data { /* point to bad page records array */ struct eeprom_table_record *bps; @@ -773,4 +790,12 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, const struct amdgpu_ras_err_status_reg_entry *reg_list, uint32_t reg_list_size, uint32_t instance); + +int amdgpu_ras_error_data_init(struct ras_err_data *err_data); +void amdgpu_ras_error_data_fini(struct ras_err_data *err_data); +int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count); +int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count); + #endif |