drm/amdgpu: fix sriov host flr handler

We send back the ready to reset message before we stop anything. This is wrong. Move it to when we are actually ready for the FLR to happen. In the current state since we take tens of seconds to stop everything, it is very likely that host would give up waiting and reset the GPU before we send ready, so it would be the same as before. But this gets rid of the hack with reset_domain locking and also let us tell how slow ready to reset actually is from the host. The ready to reset speed can be improved later. Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Reviewed-by: Emily Deng <Emily.Deng@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Yunxiang Li <Yunxiang.Li@amd.com> 2024-05-24 16:22:28 -0400
committer: Alex Deucher <alexander.deucher@amd.com> 2024-06-14 16:15:58 -0400
commit: 5c0a1cdd17ce9eb315102c65084af899622ed268 (patch)
tree: 16f9220f70bb62e86d5c1f74e634521c03e8b7dc /drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
parent: b3948ad1ac582f560e1f3aeaecf384619921c48d (diff)
1 files changed, 16 insertions, 23 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f4c47492e0cd..6b71ee85ee65 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -249,38 +249,30 @@ static int xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
 	return 0;
 }
 
-static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
+static void xgpu_ai_ready_to_reset(struct amdgpu_device *adev)
 {
-	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
-	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
-	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-
-	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
-	 * otherwise the mailbox msg will be ruined/reseted by
-	 * the VF FLR.
-	 */
-	if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0)
-		return;
-
-	down_write(&adev->reset_domain->sem);
-
-	amdgpu_virt_fini_data_exchange(adev);
-
 	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
+}
 
+static int xgpu_ai_wait_reset(struct amdgpu_device *adev)
+{
+	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
 	do {
 		if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
-			goto flr_done;
-
+			return 0;
 		msleep(10);
 		timeout -= 10;
 	} while (timeout > 1);
-
 	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
+	return -ETIME;
+}
 
-flr_done:
-	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
-	up_write(&adev->reset_domain->sem);
+static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
+{
+	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+
+	amdgpu_virt_fini_data_exchange(adev);
 
 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
@@ -417,7 +409,8 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
 	.req_full_gpu	= xgpu_ai_request_full_gpu_access,
 	.rel_full_gpu	= xgpu_ai_release_full_gpu_access,
 	.reset_gpu = xgpu_ai_request_reset,
-	.wait_reset = NULL,
+	.ready_to_reset = xgpu_ai_ready_to_reset,
+	.wait_reset = xgpu_ai_wait_reset,
 	.trans_msg = xgpu_ai_mailbox_trans_msg,
 	.req_init_data  = xgpu_ai_request_init_data,
 	.ras_poison_handler = xgpu_ai_ras_poison_handler,
author	Yunxiang Li <Yunxiang.Li@amd.com>	2024-05-24 16:22:28 -0400
committer	Alex Deucher <alexander.deucher@amd.com>	2024-06-14 16:15:58 -0400
commit	5c0a1cdd17ce9eb315102c65084af899622ed268 (patch)
tree	16f9220f70bb62e86d5c1f74e634521c03e8b7dc /drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
parent	b3948ad1ac582f560e1f3aeaecf384619921c48d (diff)