diff options
author | Farah Kassabri <fkassabri@habana.ai> | 2023-10-31 12:20:36 +0200 |
---|---|---|
committer | Oded Gabbay <ogabbay@kernel.org> | 2023-12-19 11:09:43 +0200 |
commit | d1958dce5ab6a3e089c60cf474e8c9b7e96e70ad (patch) | |
tree | c0c26a21e465d4c204ac186a114220b8be7e9a50 /drivers/accel | |
parent | 42422993cf28d456778ee9168d73758ec037cd51 (diff) |
accel/habanalabs: fix EQ heartbeat mechanism
Stop rescheduling another heartbeat check when the EQ heartbeat check fails,
as doing so generates confusing logs in dmesg saying that the heartbeat fails.
Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/accel')
-rw-r--r-- | drivers/accel/habanalabs/common/device.c | 14 |
1 files changed, 7 insertions, 7 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index d9447aeb3937..6bf5f1d0d005 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1044,20 +1044,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev) return (vendor_id == PCI_VENDOR_ID_HABANALABS); } -static void hl_device_eq_heartbeat(struct hl_device *hdev) +static int hl_device_eq_heartbeat_check(struct hl_device *hdev) { - u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; struct asic_fixed_properties *prop = &hdev->asic_prop; if (!prop->cpucp_info.eq_health_check_supported) - return; + return 0; if (hdev->eq_heartbeat_received) { hdev->eq_heartbeat_received = false; } else { dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); - hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask); + return -EIO; } + + return 0; } static void hl_device_heartbeat(struct work_struct *work) @@ -1074,10 +1075,9 @@ static void hl_device_heartbeat(struct work_struct *work) /* * For EQ health check need to check if driver received the heartbeat eq event * in order to validate the eq is working. + * Only if both the EQ is healthy and we managed to send the next heartbeat reschedule. */ - hl_device_eq_heartbeat(hdev); - - if (!hdev->asic_funcs->send_heartbeat(hdev)) + if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev))) goto reschedule; if (hl_device_operational(hdev, NULL)) |