diff options
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 89 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 |
2 files changed, 71 insertions, 21 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8dcef433cb04..d06beb884a16 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1136,11 +1136,54 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, } /** - * amdgpu_ras_query_error_count -- Get error counts of all IPs + * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP + * @adev: pointer to AMD GPU device + * @ce_count: pointer to an integer to be set to the count of correctible errors. + * @ue_count: pointer to an integer to be set to the count of uncorrectible errors. + * @query_info: pointer to ras_query_if + * + * Return 0 for query success or do nothing, otherwise return an error + * on failures + */ +static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev, + unsigned long *ce_count, + unsigned long *ue_count, + struct ras_query_if *query_info) +{ + int ret; + + if (!query_info) + /* do nothing if query_info is not specified */ + return 0; + + ret = amdgpu_ras_query_error_status(adev, query_info); + if (ret) + return ret; + + *ce_count += query_info->ce_count; + *ue_count += query_info->ue_count; + + /* some hardware/IP supports read to clear + * no need to explictly reset the err status after the query call */ + if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && + adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { + if (amdgpu_ras_reset_error_status(adev, query_info->head.block)) + dev_warn(adev->dev, + "Failed to reset error counter and error status\n"); + } + + return 0; +} + +/** + * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP * @adev: pointer to AMD GPU device * @ce_count: pointer to an integer to be set to the count of correctible errors. * @ue_count: pointer to an integer to be set to the count of uncorrectible * errors. + * @query_info: pointer to ras_query_if if the query request is only for + * specific ip block; if info is NULL, then the qurey request is for + * all the ip blocks that support query ras error counters/status * * If set, @ce_count or @ue_count, count and return the corresponding * error counts in those integer pointers. Return 0 if the device @@ -1148,11 +1191,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, */ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, unsigned long *ce_count, - unsigned long *ue_count) + unsigned long *ue_count, + struct ras_query_if *query_info) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; unsigned long ce, ue; + int ret; if (!adev->ras_enabled || !con) return -EOPNOTSUPP; @@ -1164,26 +1209,23 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, ce = 0; ue = 0; - list_for_each_entry(obj, &con->head, node) { - struct ras_query_if info = { - .head = obj->head, - }; - int res; - - res = amdgpu_ras_query_error_status(adev, &info); - if (res) - return res; + if (!query_info) { + /* query all the ip blocks that support ras query interface */ + list_for_each_entry(obj, &con->head, node) { + struct ras_query_if info = { + .head = obj->head, + }; - if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && - adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { - if (amdgpu_ras_reset_error_status(adev, info.head.block)) - dev_warn(adev->dev, "Failed to reset error counter and error status"); + ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info); } - - ce += info.ce_count; - ue += info.ue_count; + } else { + /* query specific ip block */ + ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info); } + if (ret) + return ret; + if (ce_count) *ce_count = ce; @@ -2411,7 +2453,7 @@ static void amdgpu_ras_counte_dw(struct work_struct *work) /* Cache new values. */ - if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) { atomic_set(&con->ras_ce_count, ce_count); atomic_set(&con->ras_ue_count, ue_count); } @@ -2592,6 +2634,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, { struct amdgpu_ras_block_object *ras_obj = NULL; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_query_if *query_info; unsigned long ue_count, ce_count; int r; @@ -2633,11 +2676,17 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, /* Those are the cached values at init. */ - if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { + query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL); + if (!query_info) + return -ENOMEM; + memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); + + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { atomic_set(&con->ras_ce_count, ce_count); atomic_set(&con->ras_ue_count, ue_count); } + kfree(query_info); return 0; interrupt: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index bf5a95104ec1..f2ad999993f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -540,7 +540,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev); int amdgpu_ras_query_error_count(struct amdgpu_device *adev, unsigned long *ce_count, - unsigned long *ue_count); + unsigned long *ue_count, + struct ras_query_if *query_info); /* error handling functions */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, |