summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
diff options
context:
space:
mode:
authorAndrey Grodzovsky <andrey.grodzovsky@amd.com>2022-01-21 17:23:32 -0500
committerAndrey Grodzovsky <andrey.grodzovsky@amd.com>2022-02-09 12:17:09 -0500
commitcfbb6b0047448e2d986160d9f30d60f604d9ad0f (patch)
treefdb18fa843a69019c49e92887cee48034d6b2e5b /drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
parentf287a3c5b03f51efa8d8f3e141a79177f91047e0 (diff)
drm/amdgpu: Rework reset domain to be refcounted.
The reset domain contains register access semaphor now and so needs to be present as long as each device in a hive needs it and so it cannot be binded to XGMI hive life cycle. Adress this by making reset domain refcounted and pointed by each member of the hive and the hive itself. v4: Fix crash on boot witrh XGMI hive by adding type to reset_domain. XGMI will only create a new reset_domain if prevoius was of single device type meaning it's first boot. Otherwsie it will take a refocunt to exsiting reset_domain from the amdgou device. Add a wrapper around reset_domain->refcount get/put and a wrapper around send to reset wq (Lijo) Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Link: https://www.spinics.net/lists/amd-gfx/msg74121.html
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c29
1 files changed, 23 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 89b682afe821..eb06322d2972 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -32,6 +32,8 @@
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"
+#include "amdgpu_reset.h"
+
#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
@@ -227,6 +229,9 @@ static void amdgpu_xgmi_hive_release(struct kobject *kobj)
struct amdgpu_hive_info *hive = container_of(
kobj, struct amdgpu_hive_info, kobj);
+ amdgpu_reset_put_reset_domain(hive->reset_domain);
+ hive->reset_domain = NULL;
+
mutex_destroy(&hive->hive_lock);
kfree(hive);
}
@@ -398,12 +403,24 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
goto pro_end;
}
- hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0);
- if (!hive->reset_domain.wq) {
- dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n");
- kfree(hive);
- hive = NULL;
- goto pro_end;
+ /**
+ * Avoid recreating reset domain when hive is reconstructed for the case
+ * of reset the devices in the XGMI hive during probe for SRIOV
+ * See https://www.spinics.net/lists/amd-gfx/msg58836.html
+ */
+ if (adev->reset_domain->type != XGMI_HIVE) {
+ hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
+ if (!hive->reset_domain) {
+ dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
+ ret = -ENOMEM;
+ kobject_put(&hive->kobj);
+ kfree(hive);
+ hive = NULL;
+ goto pro_end;
+ }
+ } else {
+ amdgpu_reset_get_reset_domain(adev->reset_domain);
+ hive->reset_domain = adev->reset_domain;
}
hive->hive_id = adev->gmc.xgmi.hive_id;