diff options
author | Hawking Zhang <Hawking.Zhang@amd.com> | 2024-05-30 16:02:12 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-06-05 10:58:26 -0400 |
commit | a474161e84fc0b15534a80f8dfcbaf5e48fd8249 (patch) | |
tree | ea4bebcef9600260fa44eb21b17db1f1297355f2 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | 2bac084468847cfe5bbc7166082b2a208514bb1c (diff) |
drm/amdgpu: Update programming for boot error reporting
AMDGPU_RAS_GPU_ERR_BOOT_STATUS field is no longer valid.
The polling sequence is also simplified according to
the latest firmware change.
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 99 |
1 files changed, 45 insertions, 54 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 45b696524541..8dbfdb767f94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -4400,64 +4400,74 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, #define mmMP0_SMN_C2PMSG_92 0x1609C #define mmMP0_SMN_C2PMSG_126 0x160BE static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, - u32 instance, u32 boot_error) + u32 instance) { u32 socket_id, aid_id, hbm_id; - u32 reg_data; + u32 fw_status; + u32 boot_error; u64 reg_addr; - socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); - aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); - hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); - /* The pattern for smn addressing in other SOC could be different from * the one for aqua_vanjaram. We should revisit the code if the pattern * is changed. In such case, replace the aqua_vanjaram implementation * with more common helper */ reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + aqua_vanjaram_encode_ext_smn_addressing(instance); + fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + + aqua_vanjaram_encode_ext_smn_addressing(instance); + boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr); - reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); - dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", - socket_id, aid_id, reg_data); + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", - socket_id, aid_id, hbm_id); + dev_info(adev->dev, + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n", + socket_id, aid_id, hbm_id, fw_status); if 
(AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", - socket_id, aid_id, hbm_id); + dev_info(adev->dev, + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n", + socket_id, aid_id, hbm_id, fw_status); if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", - socket_id, aid_id, hbm_id); + dev_info(adev->dev, + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n", + socket_id, aid_id, hbm_id, fw_status); } 
-static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, - u32 instance, u32 *boot_error) +static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev, + u32 instance) { - u32 reg_addr; + u64 reg_addr; u32 reg_data; int retry_loop; @@ -4466,41 +4476,22 @@ static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); - if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) { - *boot_error = AMDGPU_RAS_BOOT_SUCEESS; - return 0; - } - msleep(1); - } - - /* The pattern for smn addressing in other SOC could be different from - * the one for aqua_vanjaram. We should revisit the code if the pattern - * is changed. In such case, replace the aqua_vanjaram implementation - * with more common helper */ - reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + - aqua_vanjaram_encode_ext_smn_addressing(instance); - - for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { - reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); - if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) { - *boot_error = reg_data; - return 0; - } - msleep(1); + if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) + return false; + else + msleep(1); } - *boot_error = reg_data; - return -ETIME; + return true; } void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances) { - u32 boot_error = 0; u32 i; for (i = 0; i < num_instances; i++) { - if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error)) - amdgpu_ras_boot_time_error_reporting(adev, i, boot_error); + if (amdgpu_ras_boot_error_detected(adev, i)) + amdgpu_ras_boot_time_error_reporting(adev, i); } } |