drm/amdgpu: revert "fix system hang issue during GPU reset"

The whole approach wasn't thought through till the end. We already had a reset lock like this in the past and it caused the same problems like this one. Completely revert the patch for now and add individual trylock protection to the hardware access functions as necessary. This reverts commit df9c8d1aa278c435c30a69b8f2418b4a52fcb929. Signed-off-by: Christian König <christian.koenig@amd.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Christian König <christian.koenig@amd.com> 2020-08-12 17:48:26 +0200
committer: Alex Deucher <alexander.deucher@amd.com> 2020-08-14 16:22:40 -0400
commit: f1403342ebdfcff3c3cf57ae476f19d3078f2767 (patch)
tree: d94e6a6c652ebc0688fdb0c57587712c75970472 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent: 05f39286ce11b98376e0d179aff0d537c257e772 (diff)
1 files changed, 24 insertions, 33 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bb7f0c8611f9..415e1a32b98c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
 			if (adev->ip_blocks[i].status.hw == true)
 				break;
 
-			if (amdgpu_in_reset(adev) || adev->in_suspend) {
+			if (adev->in_gpu_reset || adev->in_suspend) {
 				r = adev->ip_blocks[i].version->funcs->resume(adev);
 				if (r) {
 					DRM_ERROR("resume of IP block <%s> failed %d\n",
@@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
 			AMDGPU_RESET_MAGIC_NUM))
 		return true;
 
-	if (!amdgpu_in_reset(adev))
+	if (!adev->in_gpu_reset)
 		return false;
 
 	/*
@@ -3053,8 +3053,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->mn_lock);
 	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
-	init_rwsem(&adev->reset_sem);
-	atomic_set(&adev->in_gpu_reset, 0);
+	mutex_init(&adev->lock_reset);
 	mutex_init(&adev->psp.mutex);
 	mutex_init(&adev->notifier_lock);
 
@@ -4082,11 +4081,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
-			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) {
-				dev_warn(tmp_adev->dev, "asic atom init failed!");
-				r = -EAGAIN;
-				goto out;
-			}
+			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
+				DRM_WARN("asic atom init failed!");
 
 			if (!r) {
 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
@@ -4176,18 +4172,16 @@ end:
 	return r;
 }
 
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
 {
-	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
-		return false;
-
-	if (hive) {
-		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
-	} else {
-		down_write(&adev->reset_sem);
-	}
+	if (trylock) {
+		if (!mutex_trylock(&adev->lock_reset))
+			return false;
+	} else
+		mutex_lock(&adev->lock_reset);
 
 	atomic_inc(&adev->gpu_reset_counter);
+	adev->in_gpu_reset = true;
 	switch (amdgpu_asic_reset_method(adev)) {
 	case AMD_RESET_METHOD_MODE1:
 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -4207,8 +4201,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 {
 	amdgpu_vf_error_trans_all(adev);
 	adev->mp1_state = PP_MP1_STATE_NONE;
-	atomic_set(&adev->in_gpu_reset, 0);
-	up_write(&adev->reset_sem);
+	adev->in_gpu_reset = false;
+	mutex_unlock(&adev->lock_reset);
 }
 
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@@ -4318,14 +4312,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * We always reset all schedulers for device and all devices for XGMI
 	 * hive so that should take care of them too.
 	 */
-	hive = amdgpu_get_xgmi_hive(adev, false);
-	if (hive) {
-		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
-			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
-				job ? job->base.id : -1, hive->hive_id);
-			return 0;
-		}
-		mutex_lock(&hive->hive_lock);
+	hive = amdgpu_get_xgmi_hive(adev, true);
+	if (hive && !mutex_trylock(&hive->reset_lock)) {
+		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
+			  job ? job->base.id : -1, hive->hive_id);
+		mutex_unlock(&hive->hive_lock);
+		return 0;
 	}
 
 	/*
@@ -4347,11 +4339,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	/* block all schedulers and reset given job's ring */
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
+		if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
 			DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
 				  job ? job->base.id : -1);
-			r = 0;
-			goto skip_recovery;
+			mutex_unlock(&hive->hive_lock);
+			return 0;
 		}
 
 		/*
@@ -4484,9 +4476,8 @@ skip_sched_resume:
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
 
-skip_recovery:
 	if (hive) {
-		atomic_set(&hive->in_reset, 0);
+		mutex_unlock(&hive->reset_lock);
 		mutex_unlock(&hive->hive_lock);
 	}
author	Christian König <christian.koenig@amd.com>	2020-08-12 17:48:26 +0200
committer	Alex Deucher <alexander.deucher@amd.com>	2020-08-14 16:22:40 -0400
commit	f1403342ebdfcff3c3cf57ae476f19d3078f2767 (patch)
tree	d94e6a6c652ebc0688fdb0c57587712c75970472 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent	05f39286ce11b98376e0d179aff0d537c257e772 (diff)