summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c20
1 files changed, 17 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 905c5ab486a1..d01d7969d47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4125,8 +4125,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
amdgpu_fbdev_set_suspend(tmp_adev, 0);
- /* must succeed. */
- amdgpu_ras_resume(tmp_adev);
+ /*
+ * The GPU enters bad state once faulty pages
+ * by ECC has reached the threshold, and ras
+ * recovery is scheduled next. So add one check
+ * here to break recovery if it indeed exceeds
+ * bad page threshold, and remind user to
+ * retire this GPU or setting one bigger
+ * bad_page_threshold value to fix this once
+ * probing driver again.
+ */
+ if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+ /* must succeed. */
+ amdgpu_ras_resume(tmp_adev);
+ } else {
+ r = -EINVAL;
+ goto out;
+ }
/* Update PSP FW topology after reset */
if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
@@ -4134,7 +4149,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
}
}
-
out:
if (!r) {
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);