diff options
author | YiPeng Chai <YiPeng.Chai@amd.com> | 2023-12-12 17:23:23 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2023-12-19 14:59:03 -0500 |
commit | 99cab331a4ee621e3604542ca88f9d76f2865aef (patch) | |
tree | cff02d6e104ba41b92f96be5a4bc35546ab6e4d4 /drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | |
parent | 6fe08f56db798659beca41ab5b1727a31518f794 (diff) |
drm/amdgpu: Add umc page retirement for umc v12_0
Add umc page retirement for umc v12_0.
V2:
1. Changed umc page retirement check condition
to call umc_v12_0_is_uncorrectable_error.
2. Use memset to clear the contents of the umc
error address structure.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/umc_v12_0.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 8430888760ba..7458a218e89d 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -26,6 +26,7 @@ #include "amdgpu.h" #include "umc/umc_12_0_0_offset.h" #include "umc/umc_12_0_0_sh_mask.h" +#include "mp/mp_13_0_6_sh_mask.h" const uint32_t umc_v12_0_channel_idx_tbl[] @@ -370,6 +371,59 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev, return 0; } +static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + amdgpu_mca_smu_log_ras_error(adev, + AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status); + amdgpu_mca_smu_log_ras_error(adev, + AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status); +} + +static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev, + void *ras_error_status) +{ + struct ras_err_node *err_node; + uint64_t mc_umc_status; + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + + for_each_ras_error(err_node, err_data) { + mc_umc_status = err_node->err_info.err_addr.err_status; + if (!mc_umc_status) + continue; + + if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) { + uint64_t mca_addr, err_addr, mca_ipid; + uint32_t InstanceIdLo; + struct amdgpu_smuio_mcm_config_info *mcm_info; + + mcm_info = &err_node->err_info.mcm_info; + mca_addr = err_node->err_info.err_addr.err_addr; + mca_ipid = err_node->err_info.err_addr.err_ipid; + + err_addr = REG_GET_FIELD(mca_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); + InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo); + + dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n", + mca_ipid, + mcm_info->die_id, + MCA_IPID_LO_2_UMC_INST(InstanceIdLo), + MCA_IPID_LO_2_UMC_CH(InstanceIdLo), + err_addr); + + umc_v12_0_convert_error_address(adev, + err_data, err_addr, + MCA_IPID_LO_2_UMC_CH(InstanceIdLo), + MCA_IPID_LO_2_UMC_INST(InstanceIdLo), + mcm_info->die_id); + + /* Clear umc error address content */ + memset(&err_node->err_info.err_addr, + 0, sizeof(err_node->err_info.err_addr)); + } + } +} + static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev) { amdgpu_umc_loop_channels(adev, @@ -396,4 +450,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = { }, .err_cnt_init = umc_v12_0_err_cnt_init, .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode, + .ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count, + .ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address, }; |