diff options
author | Hawking Zhang <Hawking.Zhang@amd.com> | 2024-01-02 19:32:56 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-01-15 18:35:35 -0500 |
commit | cce4febb274cc16655dd4692fbcebc6c7d7953b5 (patch) | |
tree | d4532e9cd25dc91b56e783007d83861a898758de | |
parent | f5e4cc8461c408dcb679bb1f7c3bd8a586406709 (diff) |
drm/amdgpu: Add ras helper to query boot errors v2
Add ras helper function to query boot time gpu
errors.
v2: use aqua_vanjaram smn addressing pattern
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Le Ma <le.ma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 |
3 files changed, 110 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 031dba7582c5..00bf4a364a92 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1333,6 +1333,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev); #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) +#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l)) /* * BIOS helpers. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 31823a30dea2..a86f91838301 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3767,3 +3767,98 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, return 0; } + +#define mmMP0_SMN_C2PMSG_92 0x1609C +#define mmMP0_SMN_C2PMSG_126 0x160BE +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, + u32 instance, u32 boot_error) +{ + u32 socket_id, aid_id, hbm_id; + u32 reg_data; + u64 reg_addr; + + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); + + /* The pattern for smn addressing in other SOC could be different from + * the one for aqua_vanjaram. We should revisit the code if the pattern + * is changed. In such case, replace the aqua_vanjaram implementation + * with more common helper */ + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + + aqua_vanjaram_encode_ext_smn_addressing(instance); + + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", + socket_id, aid_id, reg_data); + + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", + socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", + socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", + socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", + socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", + socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n", + socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", + socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", + socket_id, aid_id, hbm_id); +} + +static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, + u32 instance, u32 *boot_error) +{ + u32 reg_addr; + u32 reg_data; + int retry_loop; + + /* The pattern for smn addressing in other SOC could be different from + * the one for aqua_vanjaram. We should revisit the code if the pattern + * is changed. In such case, replace the aqua_vanjaram implementation + * with more common helper */ + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + + aqua_vanjaram_encode_ext_smn_addressing(instance); + + for (retry_loop = 0; retry_loop < 1000; retry_loop++) { + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) { + *boot_error = reg_data; + return 0; + } + msleep(1); + } + + *boot_error = reg_data; + return -ETIME; +} + +void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances) +{ + u32 boot_error = 0; + u32 i; + + for (i = 0; i < num_instances; i++) { + if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error)) + amdgpu_ras_boot_time_error_reporting(adev, i, boot_error); + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index c36faf353b46..80e574475aa3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -33,6 +33,19 @@ struct amdgpu_iv_entry; +#define AMDGPU_RAS_GPU_ERR_MEM_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 0, 0) +#define AMDGPU_RAS_GPU_ERR_FW_LOAD(x) AMDGPU_GET_REG_FIELD(x, 1, 1) +#define AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 2, 2) +#define AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 3, 3) +#define AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 4, 4) +#define AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 5, 5) +#define AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(x) AMDGPU_GET_REG_FIELD(x, 6, 6) +#define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x) AMDGPU_GET_REG_FIELD(x, 7, 7) +#define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8) +#define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11) +#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 13, 13) +#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x) AMDGPU_GET_REG_FIELD(x, 31, 31) + #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) /* position of instance value in sub_block_index of * ta_ras_trigger_error_input, the sub block uses lower 12 bits @@ -819,5 +832,5 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, struct amdgpu_smuio_mcm_config_info *mcm_info, struct ras_err_addr *err_addr, u64 count); - +void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances); #endif |