diff options
author | Vignesh Chander <Vignesh.Chander@amd.com> | 2024-06-24 16:44:26 -0500 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-06-27 17:31:37 -0400 |
commit | cbda2758d8bfae323b846210a3e52f0ad5fe7164 (patch) | |
tree | 034e4d34e668d0dd014fe139efa6e9736b7b28e4 /drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | |
parent | 78146c1dcd220ae98fd5f4114f992299fc5ee161 (diff) |
drm/amdgpu: process RAS fatal error MB notification
For RAS error scenario, VF guest driver will check mailbox
and set fed flag to avoid unnecessary HW accesses.
additionally, poll for reset completion message first
to avoid accidentally spamming multiple reset requests to host.
v2: add another mailbox check for handling case where kfd detects
timeout first
v3: set host_flr bit and use wait_for_reset
Signed-off-by: Vignesh Chander <Vignesh.Chander@amd.com>
Reviewed-by: Zhigang Luo <Zhigang.Luo@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 25 |
1 files changed, 22 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 63f2286858c4..ccb3d041c2b2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -229,6 +229,22 @@ void amdgpu_virt_free_mm_table(struct amdgpu_device *adev) adev->virt.mm_table.gpu_addr = 0; } +/** + * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt + * @adev: amdgpu device. + * Check whether host sent RAS error message + * Return: true if found, otherwise false + */ +bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev) +{ + struct amdgpu_virt *virt = &adev->virt; + + if (!virt->ops || !virt->ops->rcvd_ras_intr) + return false; + + return virt->ops->rcvd_ras_intr(adev); +} + unsigned int amd_sriov_msg_checksum(void *obj, unsigned long obj_size, @@ -612,11 +628,14 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) ret = amdgpu_virt_read_pf2vf_data(adev); if (ret) { adev->virt.vf2pf_update_retry_cnt++; - if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && - amdgpu_sriov_runtime(adev)) { + + if ((amdgpu_virt_rcvd_ras_interrupt(adev) || + adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && + amdgpu_sriov_runtime(adev)) { + amdgpu_ras_set_fed(adev, true); if (amdgpu_reset_domain_schedule(adev->reset_domain, - &adev->kfd.reset_work)) + &adev->kfd.reset_work)) return; else dev_err(adev->dev, "Failed to queue work! at %s", __func__); |