diff options
author | Asad Kamal <asad.kamal@amd.com> | 2023-10-05 15:40:42 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2023-10-19 18:26:51 -0400 |
commit | 53dd920c1f471a5763c660a7b94fe0aaf746d357 (patch) | |
tree | 67ee44f4003a8a8bb1e8812fb2c03c2df4a6fe3e /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | 16fb2a41e64e3133e9457c85490f6ee36c2ffaaf (diff) |
drm/amdgpu : Add hive ras recovery check
If one of the devices in the hive detects a
fatal error, need to send ras recovery reset
message to PMFW of all devices in the hive.
For that add a flag in hive to indicate that
it's undergoing ras recovery
Signed-off-by: Asad Kamal <asad.kamal@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 |
1 files changed, 7 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 973073e07b2a..c7f8dcb3b4d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2140,9 +2140,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + if (hive) + atomic_set(&hive->ras_recovery, 1); if (!ras->disable_ras_err_cnt_harvest) { - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { @@ -2159,7 +2161,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_ras_log_on_err_counter(remote_adev); } - amdgpu_put_xgmi_hive(hive); } if (amdgpu_device_should_recover_gpu(ras->adev)) { @@ -2194,6 +2195,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } atomic_set(&ras->in_recovery, 0); + if (hive) { + atomic_set(&hive->ras_recovery, 0); + amdgpu_put_xgmi_hive(hive); + } } /* alloc/realloc bps array */ |