summaryrefslogtreecommitdiff
path: root/drivers/gpu
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c25
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h5
7 files changed, 55 insertions, 8 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 183e219b6a85..b27336a05aae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5069,7 +5069,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive = NULL;
if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
- amdgpu_virt_ready_to_reset(adev);
+ if (!amdgpu_ras_get_fed_status(adev))
+ amdgpu_virt_ready_to_reset(adev);
amdgpu_virt_wait_reset(adev);
clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
r = amdgpu_virt_request_full_gpu(adev, true);
@@ -5837,6 +5838,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
/* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
+ if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
+ dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
+ amdgpu_ras_set_fed(adev, true);
+ set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
+ }
+
r = amdgpu_device_reset_sriov(adev, reset_context);
if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
amdgpu_virt_release_full_gpu(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 63f2286858c4..ccb3d041c2b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -229,6 +229,22 @@ void amdgpu_virt_free_mm_table(struct amdgpu_device *adev)
adev->virt.mm_table.gpu_addr = 0;
}
+/**
+ * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt
+ * @adev: amdgpu device.
+ * Check whether host sent RAS error message
+ * Return: true if found, otherwise false
+ */
+bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev)
+{
+ struct amdgpu_virt *virt = &adev->virt;
+
+ if (!virt->ops || !virt->ops->rcvd_ras_intr)
+ return false;
+
+ return virt->ops->rcvd_ras_intr(adev);
+}
+
unsigned int amd_sriov_msg_checksum(void *obj,
unsigned long obj_size,
@@ -612,11 +628,14 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
ret = amdgpu_virt_read_pf2vf_data(adev);
if (ret) {
adev->virt.vf2pf_update_retry_cnt++;
- if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
- amdgpu_sriov_runtime(adev)) {
+
+ if ((amdgpu_virt_rcvd_ras_interrupt(adev) ||
+ adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
+ amdgpu_sriov_runtime(adev)) {
+
amdgpu_ras_set_fed(adev, true);
if (amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->kfd.reset_work))
+ &adev->kfd.reset_work))
return;
else
dev_err(adev->dev, "Failed to queue work! at %s", __func__);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index f04cd1586c72..b42a8854dca0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -52,7 +52,7 @@
/* tonga/fiji use this offset */
#define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
-#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5
+#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
enum amdgpu_sriov_vf_mode {
SRIOV_VF_MODE_BARE_METAL = 0,
@@ -94,6 +94,7 @@ struct amdgpu_virt_ops {
u32 data1, u32 data2, u32 data3);
void (*ras_poison_handler)(struct amdgpu_device *adev,
enum amdgpu_ras_block block);
+ bool (*rcvd_ras_intr)(struct amdgpu_device *adev);
};
/*
@@ -352,6 +353,7 @@ void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev);
int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
+bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev);
void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev);
void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev);
void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 65656afc6ed1..f5411b798e11 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -408,6 +408,13 @@ static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
}
+static bool xgpu_ai_rcvd_ras_intr(struct amdgpu_device *adev)
+{
+ enum idh_event msg = xgpu_ai_mailbox_peek_msg(adev);
+
+ return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
+}
+
const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
.req_full_gpu = xgpu_ai_request_full_gpu_access,
.rel_full_gpu = xgpu_ai_release_full_gpu_access,
@@ -417,4 +424,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
.trans_msg = xgpu_ai_mailbox_trans_msg,
.req_init_data = xgpu_ai_request_init_data,
.ras_poison_handler = xgpu_ai_ras_poison_handler,
+ .rcvd_ras_intr = xgpu_ai_rcvd_ras_intr,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index c520b2fabfb9..ed57cbc150af 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -51,7 +51,9 @@ enum idh_event {
IDH_FAIL,
IDH_QUERY_ALIVE,
IDH_REQ_GPU_INIT_DATA_READY,
-
+ IDH_RAS_POISON_READY,
+ IDH_PF_SOFT_FLR_NOTIFICATION,
+ IDH_RAS_ERROR_DETECTED,
IDH_TEXT_MESSAGE = 255,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 17e1e8cc2437..f47bd7ada4d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -449,6 +449,13 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
}
}
+static bool xgpu_nv_rcvd_ras_intr(struct amdgpu_device *adev)
+{
+ enum idh_event msg = xgpu_nv_mailbox_peek_msg(adev);
+
+ return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
+}
+
const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
.req_full_gpu = xgpu_nv_request_full_gpu_access,
.rel_full_gpu = xgpu_nv_release_full_gpu_access,
@@ -458,4 +465,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
.wait_reset = xgpu_nv_wait_reset,
.trans_msg = xgpu_nv_mailbox_trans_msg,
.ras_poison_handler = xgpu_nv_ras_poison_handler,
+ .rcvd_ras_intr = xgpu_nv_rcvd_ras_intr,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
index 1e8fd90cab43..caf616a2c8a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
@@ -26,7 +26,7 @@
#define NV_MAILBOX_POLL_ACK_TIMEDOUT 500
#define NV_MAILBOX_POLL_MSG_TIMEDOUT 6000
-#define NV_MAILBOX_POLL_FLR_TIMEDOUT 5000
+#define NV_MAILBOX_POLL_FLR_TIMEDOUT 10000
#define NV_MAILBOX_POLL_MSG_REP_MAX 11
enum idh_request {
@@ -52,7 +52,8 @@ enum idh_event {
IDH_QUERY_ALIVE,
IDH_REQ_GPU_INIT_DATA_READY,
IDH_RAS_POISON_READY,
-
+ IDH_PF_SOFT_FLR_NOTIFICATION,
+ IDH_RAS_ERROR_DETECTED,
IDH_TEXT_MESSAGE = 255,
};