diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 69 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 15 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 18 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 37 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 33 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 29 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 10 |
7 files changed, 128 insertions, 83 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 493982f94649..2ad2d874f42f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -28,7 +28,7 @@ #define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype} -typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data); +typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); struct aca_banks { int nr_banks; @@ -86,7 +86,7 @@ static void aca_banks_release(struct aca_banks *banks) } } -static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_error_type type, u32 *count) +static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count) { struct amdgpu_aca *aca = &adev->aca; const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; @@ -127,7 +127,7 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); } -static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_error_type type, +static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type, int start, int count, struct aca_banks *banks) { @@ -143,13 +143,12 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro return -EOPNOTSUPP; switch (type) { - case ACA_ERROR_TYPE_UE: + case ACA_SMU_TYPE_UE: max_count = smu_funcs->max_ue_bank_count; break; - case ACA_ERROR_TYPE_CE: + case ACA_SMU_TYPE_CE: max_count = smu_funcs->max_ce_bank_count; break; - case ACA_ERROR_TYPE_DEFERRED: default: return -EINVAL; } @@ -164,6 +163,8 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro if (ret) return ret; + bank.type = type; + aca_smu_bank_dump(adev, i, count, &bank); ret = aca_banks_add_bank(banks, &bank); @@ -195,7 +196,7 @@ static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type t return hwip->hwid == hwid && hwip->mcatype == mcatype; } -static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type) +static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type) { const struct aca_bank_ops *bank_ops = handle->bank_ops; @@ -297,7 +298,7 @@ static int aca_log_errors(struct aca_handle *handle, enum aca_error_type type, } static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank, - enum aca_error_type type, struct aca_bank_report *report) + enum aca_smu_type type, struct aca_bank_report *report) { const struct aca_bank_ops *bank_ops = handle->bank_ops; @@ -313,12 +314,24 @@ static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank * } static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank, - enum aca_error_type type, void *data) + enum aca_smu_type smu_type, void *data) { struct aca_bank_report report; + enum aca_error_type type; int ret; - ret = aca_generate_bank_report(handle, bank, type, &report); + switch (smu_type) { + case ACA_SMU_TYPE_UE: + type = ACA_ERROR_TYPE_UE; + break; + case ACA_SMU_TYPE_CE: + type = ACA_ERROR_TYPE_CE; + break; + default: + return -EINVAL; + } + + ret = aca_generate_bank_report(handle, bank, smu_type, &report); if (ret) return ret; @@ -333,7 +346,7 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank } static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank, - enum aca_error_type type, bank_handler_t handler, void *data) + enum aca_smu_type type, bank_handler_t handler, void *data) { struct aca_handle *handle; int ret; @@ -354,7 +367,7 @@ static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *ba } static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks, - enum aca_error_type type, bank_handler_t handler, void *data) + enum aca_smu_type type, bank_handler_t handler, void *data) { struct aca_bank_node *node; struct aca_bank *bank; @@ -378,7 +391,7 @@ static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks * return 0; } -static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type, +static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, bank_handler_t handler, void *data) { struct amdgpu_aca *aca = &adev->aca; @@ -389,10 +402,6 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type if (list_empty(&aca->mgr.list)) return 0; - /* NOTE: pmfw is only support UE and CE */ - if (type == ACA_ERROR_TYPE_DEFERRED) - type = ACA_ERROR_TYPE_CE; - ret = aca_smu_get_valid_aca_count(adev, type, &count); if (ret) return ret; @@ -479,10 +488,22 @@ out_unlock: static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type, struct ras_err_data *err_data) { + enum aca_smu_type smu_type; int ret; + switch (type) { + case ACA_ERROR_TYPE_UE: + smu_type = ACA_SMU_TYPE_UE; + break; + case ACA_ERROR_TYPE_CE: + smu_type = ACA_SMU_TYPE_CE; + break; + default: + return -EINVAL; + } + /* udpate aca bank to aca source error_cache first */ - ret = aca_banks_update(adev, type, handler_aca_log_bank_error, NULL); + ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, NULL); if (ret) return ret; @@ -784,7 +805,7 @@ static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val) return 0; } -static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_error_type type, int idx) +static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx) { struct aca_bank_info info; int i, ret; @@ -793,7 +814,7 @@ static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_e if (ret) return; - seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_ERROR_TYPE_UE ? "UE" : "CE"); + seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE"); seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n", idx, info.socket_id, info.die_id, info.hwid, info.mcatype); @@ -807,7 +828,7 @@ struct aca_dump_context { }; static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank, - enum aca_error_type type, void *data) + enum aca_smu_type type, void *data) { struct aca_dump_context *ctx = (struct aca_dump_context *)data; @@ -816,7 +837,7 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban return handler_aca_log_bank_error(handle, bank, type, NULL); } -static int aca_dump_show(struct seq_file *m, enum aca_error_type type) +static int aca_dump_show(struct seq_file *m, enum aca_smu_type type) { struct amdgpu_device *adev = (struct amdgpu_device *)m->private; struct aca_dump_context context = { @@ -829,7 +850,7 @@ static int aca_dump_show(struct seq_file *m, enum aca_error_type type) static int aca_dump_ce_show(struct seq_file *m, void *unused) { - return aca_dump_show(m, ACA_ERROR_TYPE_CE); + return aca_dump_show(m, ACA_SMU_TYPE_CE); } static int aca_dump_ce_open(struct inode *inode, struct file *file) @@ -847,7 +868,7 @@ static const struct file_operations aca_ce_dump_debug_fops = { static int aca_dump_ue_show(struct seq_file *m, void *unused) { - return aca_dump_show(m, ACA_ERROR_TYPE_UE); + return aca_dump_show(m, ACA_SMU_TYPE_UE); } static int aca_dump_ue_open(struct inode *inode, struct file *file) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 2da50e095883..a48f4abc345c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -99,7 +99,14 @@ enum aca_error_type { ACA_ERROR_TYPE_COUNT }; +enum aca_smu_type { + ACA_SMU_TYPE_UE = 0, + ACA_SMU_TYPE_CE, + ACA_SMU_TYPE_COUNT, +}; + struct aca_bank { + enum aca_smu_type type; u64 regs[ACA_MAX_REGS_COUNT]; }; @@ -157,9 +164,9 @@ struct aca_handle { }; struct aca_bank_ops { - int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, + int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, struct aca_bank_report *report, void *data); - bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, + bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); }; @@ -167,8 +174,8 @@ struct aca_smu_funcs { int max_ue_bank_count; int max_ce_bank_count; int (*set_debug_mode)(struct amdgpu_device *adev, bool enable); - int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_error_type type, u32 *count); - int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_error_type type, int idx, struct aca_bank *bank); + int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count); + int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank); }; struct amdgpu_aca { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 20d51f6c9bb8..43b1aaa04234 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -1035,12 +1035,12 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) return 0; } -static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, +static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, struct aca_bank_report *report, void *data) { struct amdgpu_device *adev = handle->adev; const char *error_str; - u64 status; + u64 status, count; int ret, ext_error_code; ret = aca_bank_info_decode(bank, &report->info); @@ -1055,9 +1055,17 @@ static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struc if (error_str) dev_info(adev->dev, "%s detected\n", error_str); - if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) || - (type == ACA_ERROR_TYPE_CE && ext_error_code == 6)) - report->count[type] = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]); + count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]); + switch (type) { + case ACA_SMU_TYPE_UE: + report->count[ACA_ERROR_TYPE_UE] = ext_error_code == 0 ? count : 0ULL; + break; + case ACA_SMU_TYPE_CE: + report->count[ACA_ERROR_TYPE_CE] = ext_error_code == 6 ? count : 0ULL; + break; + default: + return -EINVAL; + } return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index b53c8fd4e8cf..bb5be8c4cf11 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -681,37 +681,40 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_3_gfx_funcs = { }; static int gfx_v9_4_3_aca_bank_generate_report(struct aca_handle *handle, - struct aca_bank *bank, enum aca_error_type type, + struct aca_bank *bank, enum aca_smu_type type, struct aca_bank_report *report, void *data) { - u64 status, misc0; + u64 misc0; u32 instlo; int ret; - status = bank->regs[ACA_REG_IDX_STATUS]; - if ((type == ACA_ERROR_TYPE_UE && - ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) || - (type == ACA_ERROR_TYPE_CE && - ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) { + ret = aca_bank_info_decode(bank, &report->info); + if (ret) + return ret; - ret = aca_bank_info_decode(bank, &report->info); - if (ret) - return ret; + /* NOTE: overwrite info.die_id with xcd id for gfx */ + instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); + instlo &= GENMASK(31, 1); + report->info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1; - /* NOTE: overwrite info.die_id with xcd id for gfx */ - instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); - instlo &= GENMASK(31, 1); - report->info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1; + misc0 = bank->regs[ACA_REG_IDX_MISC0]; - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - report->count[type] = ACA_REG__MISC0__ERRCNT(misc0); + switch (type) { + case ACA_SMU_TYPE_UE: + report->count[ACA_ERROR_TYPE_UE] = 1ULL; + break; + case ACA_SMU_TYPE_CE: + report->count[ACA_ERROR_TYPE_CE] = ACA_REG__MISC0__ERRCNT(misc0); + break; + default: + return -EINVAL; } return 0; } static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_error_type type, void *data) + enum aca_smu_type type, void *data) { u32 instlo; diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index c0fc44cdd658..bf83e2c08f04 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -707,24 +707,27 @@ static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = { }; static int mmhub_v1_8_aca_bank_generate_report(struct aca_handle *handle, - struct aca_bank *bank, enum aca_error_type type, + struct aca_bank *bank, enum aca_smu_type type, struct aca_bank_report *report, void *data) { - u64 status, misc0; + u64 misc0; int ret; - status = bank->regs[ACA_REG_IDX_STATUS]; - if ((type == ACA_ERROR_TYPE_UE && - ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) || - (type == ACA_ERROR_TYPE_CE && - ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) { - - ret = aca_bank_info_decode(bank, &report->info); - if (ret) - return ret; - - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - report->count[type] = ACA_REG__MISC0__ERRCNT(misc0); + ret = aca_bank_info_decode(bank, &report->info); + if (ret) + return ret; + + misc0 = bank->regs[ACA_REG_IDX_MISC0]; + + switch (type) { + case ACA_SMU_TYPE_UE: + report->count[ACA_ERROR_TYPE_UE] = 1ULL; + break; + case ACA_SMU_TYPE_CE: + report->count[ACA_ERROR_TYPE_CE] = ACA_REG__MISC0__ERRCNT(misc0); + break; + default: + return -EINVAL; } return 0; @@ -741,7 +744,7 @@ static int mmhub_v1_8_err_codes[] = { }; static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_error_type type, void *data) + enum aca_smu_type type, void *data) { u32 instlo; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 103dc9c7325f..9aa5c190950e 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -2190,24 +2190,27 @@ static const struct amdgpu_ras_block_hw_ops sdma_v4_4_2_ras_hw_ops = { }; static int sdma_v4_4_2_aca_bank_generate_report(struct aca_handle *handle, - struct aca_bank *bank, enum aca_error_type type, + struct aca_bank *bank, enum aca_smu_type type, struct aca_bank_report *report, void *data) { - u64 status, misc0; + u64 misc0; int ret; - status = bank->regs[ACA_REG_IDX_STATUS]; - if ((type == ACA_ERROR_TYPE_UE && - ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) || - (type == ACA_ERROR_TYPE_CE && - ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) { + ret = aca_bank_info_decode(bank, &report->info); + if (ret) + return ret; - ret = aca_bank_info_decode(bank, &report->info); - if (ret) - return ret; + misc0 = bank->regs[ACA_REG_IDX_MISC0]; - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - report->count[type] = ACA_REG__MISC0__ERRCNT(misc0); + switch (type) { + case ACA_SMU_TYPE_UE: + report->count[ACA_ERROR_TYPE_UE] = 1ULL; + break; + case ACA_SMU_TYPE_CE: + report->count[ACA_ERROR_TYPE_CE] = ACA_REG__MISC0__ERRCNT(misc0); + break; + default: + return -EINVAL; } return 0; @@ -2217,7 +2220,7 @@ static int sdma_v4_4_2_aca_bank_generate_report(struct aca_handle *handle, static int sdma_v4_4_2_err_codes[] = { 33, 34, 35, 36 }; static bool sdma_v4_4_2_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_error_type type, void *data) + enum aca_smu_type type, void *data) { u32 instlo; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 4a02e1f041da..7b841431344f 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -504,7 +504,7 @@ const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = { .query_ras_error_address = umc_v12_0_query_ras_error_address, }; -static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, +static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, struct aca_bank_report *report, void *data) { struct amdgpu_device *adev = handle->adev; @@ -517,14 +517,14 @@ static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct status = bank->regs[ACA_REG_IDX_STATUS]; switch (type) { - case ACA_ERROR_TYPE_UE: + case ACA_SMU_TYPE_UE: if (umc_v12_0_is_uncorrectable_error(adev, status)) { - report->count[type] = 1; + report->count[ACA_ERROR_TYPE_UE] = 1; } break; - case ACA_ERROR_TYPE_CE: + case ACA_SMU_TYPE_CE: if (umc_v12_0_is_correctable_error(adev, status)) { - report->count[type] = 1; + report->count[ACA_ERROR_TYPE_CE] = 1; } break; default: |