summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
diff options
context:
space:
mode:
authorKent Russell <kent.russell@amd.com>2021-10-19 10:05:07 -0400
committerAlex Deucher <alexander.deucher@amd.com>2021-10-28 14:26:12 -0400
commit68daadf3d673568bb7122b1683fd8b0e27c55d9b (patch)
tree2f561e2389f17379a788c4035df738417098e207 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
parent8483fdfea778aedded76c74659692dee3756b12b (diff)
drm/amdgpu: Add kernel parameter support for ignoring bad page threshold
When a GPU hits the bad_page_threshold, it will not be initialized by the amdgpu driver. This means that the table cannot be cleared, nor can information gathering be performed (getting serial number, BDF, etc). If the bad_page_threshold kernel parameter is set to -2, continue to initialize the GPU, while printing a warning to dmesg that this action has been done v2: squash in Luben's fix to restore RAS info reporting Cc: Luben Tuikov <luben.tuikov@amd.com> Cc: Mukul Joshi <Mukul.Joshi@amd.com> Signed-off-by: Kent Russell <kent.russell@amd.com> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Luben Tuikov <luben.tuikov@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c15
1 files changed, 11 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 3978152e5ccc..05117eda105b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1105,11 +1105,18 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
res = amdgpu_ras_eeprom_correct_header_tag(control,
RAS_TABLE_HDR_VAL);
} else {
- *exceed_err_limit = true;
- dev_err(adev->dev,
- "RAS records:%d exceed threshold:%d, "
- "GPU will not be initialized. Replace this GPU or increase the threshold",
+ dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
control->ras_num_recs, ras->bad_page_cnt_threshold);
+ if (amdgpu_bad_page_threshold == -2) {
+ dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -2.");
+ res = 0;
+ } else {
+ *exceed_err_limit = true;
+ dev_err(adev->dev,
+ "RAS records:%d exceed threshold:%d, "
+ "GPU will not be initialized. Replace this GPU or increase the threshold",
+ control->ras_num_recs, ras->bad_page_cnt_threshold);
+ }
}
} else {
DRM_INFO("Creating a new EEPROM table");