summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c116
1 files changed, 96 insertions, 20 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bc459dff42f3..3b1c6b77152e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1036,7 +1036,8 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
struct ras_manager *ras_mgr,
struct ras_err_data *err_data,
const char *blk_name,
- bool is_ue)
+ bool is_ue,
+ bool is_de)
{
struct amdgpu_smuio_mcm_config_info *mcm_info;
struct ras_err_node *err_node;
@@ -1065,25 +1066,50 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
}
} else {
- for_each_ras_error(err_node, err_data) {
- err_info = &err_node->err_info;
- mcm_info = &err_info->mcm_info;
- if (err_info->ce_count) {
+ if (is_de) {
+ for_each_ras_error(err_node, err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ if (err_info->de_count) {
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld new deferred hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->de_count,
+ blk_name);
+ }
+ }
+
+ for_each_ras_error(err_node, &ras_mgr->err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld new correctable hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->ce_count,
- blk_name);
+ "%lld deferred hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id,
+ err_info->de_count, blk_name);
+ }
+ } else {
+ for_each_ras_error(err_node, err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ if (err_info->ce_count) {
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld new correctable hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ce_count,
+ blk_name);
+ }
}
- }
- for_each_ras_error(err_node, &ras_mgr->err_data) {
- err_info = &err_node->err_info;
- mcm_info = &err_info->mcm_info;
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld correctable hardware errors detected in total in %s block\n",
- mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name);
+ for_each_ras_error(err_node, &ras_mgr->err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld correctable hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id,
+ err_info->ce_count, blk_name);
+ }
}
}
}
@@ -1102,7 +1128,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
if (err_data->ce_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, false);
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ blk_name, false, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
@@ -1124,7 +1151,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
if (err_data->ue_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, true);
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ blk_name, true, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
@@ -1144,6 +1172,28 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
}
}
+ if (err_data->de_count) {
+ if (err_data_has_source_info(err_data)) {
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ blk_name, false, true);
+ } else if (!adev->aid_mask &&
+ adev->smuio.funcs &&
+ adev->smuio.funcs->get_socket_id &&
+ adev->smuio.funcs->get_die_id) {
+ dev_info(adev->dev, "socket: %d, die: %d "
+ "%ld deferred hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.de_count,
+ blk_name);
+ } else {
+ dev_info(adev->dev, "%ld deferred hardware errors "
+ "detected in %s block\n",
+ ras_mgr->err_data.de_count,
+ blk_name);
+ }
+ }
}
static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
@@ -1154,7 +1204,8 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
if (err_data_has_source_info(err_data)) {
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
-
+ amdgpu_ras_error_statistic_de_count(&obj->err_data,
+ &err_info->mcm_info, NULL, err_info->de_count);
amdgpu_ras_error_statistic_ce_count(&obj->err_data,
&err_info->mcm_info, NULL, err_info->ce_count);
amdgpu_ras_error_statistic_ue_count(&obj->err_data,
@@ -1164,6 +1215,7 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
/* for legacy asic path which doesn't has error source info */
obj->err_data.ue_count += err_data->ue_count;
obj->err_data.ce_count += err_data->ce_count;
+ obj->err_data.de_count += err_data->de_count;
}
}
@@ -1312,6 +1364,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
info->ue_count = obj->err_data.ue_count;
info->ce_count = obj->err_data.ce_count;
+ info->de_count = obj->err_data.de_count;
amdgpu_ras_error_generate_report(adev, info, &err_data);
@@ -2029,6 +2082,7 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
*/
obj->err_data.ue_count += err_data.ue_count;
obj->err_data.ce_count += err_data.ce_count;
+ obj->err_data.de_count += err_data.de_count;
}
amdgpu_ras_error_data_fini(&err_data);
@@ -3872,6 +3926,28 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
return 0;
}
+int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ struct ras_err_addr *err_addr, u64 count)
+{
+ struct ras_err_info *err_info;
+
+ if (!err_data || !mcm_info)
+ return -EINVAL;
+
+ if (!count)
+ return 0;
+
+ err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+ if (!err_info)
+ return -EINVAL;
+
+ err_info->de_count += count;
+ err_data->de_count += count;
+
+ return 0;
+}
+
#define mmMP0_SMN_C2PMSG_92 0x1609C
#define mmMP0_SMN_C2PMSG_126 0x160BE
static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,