diff options
author | YiPeng Chai <YiPeng.Chai@amd.com> | 2024-03-11 17:02:40 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-04-26 17:22:41 -0400 |
commit | a734adfbcdb0e8c382fc41d3fe8b7d194c6535f6 (patch) | |
tree | 37a4997fe0f06222bf62196d1141aa8fd25b631e /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | f493dd64ee6680dc5bb46d7c800346eadb18049a (diff) |
drm/amdgpu: add poison creation handler
Add poison creation handler.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 76 |
1 files changed, 69 insertions, 7 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index fb2f88005cf3..66186e28ab92 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2076,6 +2076,17 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj { dev_info(obj->adev->dev, "Poison is created\n"); + + if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) { + struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); + + amdgpu_ras_put_poison_req(obj->adev, + AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false); + + atomic_inc(&con->page_retirement_req_cnt); + + wake_up(&con->page_retirement_wq); + } } static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, @@ -2727,7 +2738,6 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, return 0; } -#ifdef PRE_DEFINED_FUNCTION static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev, struct ras_poison_msg *poison_msg) { @@ -2735,7 +2745,6 @@ static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev, return kfifo_get(&con->poison_fifo, poison_msg); } -#endif static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) { @@ -2766,10 +2775,54 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) mutex_destroy(&ecc_log->lock); ecc_log->de_updated = false; } + +static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev, + enum amdgpu_ras_block ras_block, uint32_t timeout_ms) +{ + int ret = 0; + struct ras_ecc_log_info *ecc_log; + struct ras_query_if info; + uint32_t timeout = timeout_ms; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + memset(&info, 0, sizeof(info)); + info.head.block = ras_block; + + ecc_log = &ras->umc_ecc_log; + ecc_log->de_updated = false; + do { + ret = amdgpu_ras_query_error_status(adev, &info); + if (ret) { + dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret); + return ret; + } + + if (timeout && !ecc_log->de_updated) { + msleep(1); + timeout--; + } + } while (timeout && !ecc_log->de_updated); + + if (timeout_ms && !timeout) { + dev_warn(adev->dev, "Can't find deferred error\n"); + return -ETIMEDOUT; + } + + return 0; +} + +static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, + uint32_t timeout) +{ + amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout); +} + static int amdgpu_ras_page_retirement_thread(void *param) { struct amdgpu_device *adev = (struct amdgpu_device *)param; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_poison_msg poison_msg; + enum amdgpu_ras_block ras_block; while (!kthread_should_stop()) { @@ -2780,13 +2833,22 @@ static int amdgpu_ras_page_retirement_thread(void *param) if (kthread_should_stop()) break; - dev_info(adev->dev, "Start processing page retirement. request:%d\n", - atomic_read(&con->page_retirement_req_cnt)); - atomic_dec(&con->page_retirement_req_cnt); - amdgpu_umc_bad_page_polling_timeout(adev, - 0, MAX_UMC_POISON_POLLING_TIME_ASYNC); + if (!amdgpu_ras_get_poison_req(adev, &poison_msg)) + continue; + + ras_block = poison_msg.block; + + dev_info(adev->dev, "Start processing ras block %s(%d)\n", + ras_block_str(ras_block), ras_block); + + if (ras_block == AMDGPU_RAS_BLOCK__UMC) + amdgpu_ras_poison_creation_handler(adev, + MAX_UMC_POISON_POLLING_TIME_ASYNC); + else + amdgpu_umc_bad_page_polling_timeout(adev, + false, MAX_UMC_POISON_POLLING_TIME_ASYNC); } return 0; |